Import Upstream version 0.14
This commit is contained in:
commit
f03dd0cdde
|
@ -0,0 +1,63 @@
|
|||
0.14 2021-01-31
|
||||
- Lexical $_ has been removed (Perl 5.24) (PR #1, thanks Tim Heaney)
|
||||
- Expose the NamedCapturingGroups method (PR #2, thanks rouzier)
|
||||
- Fix build on macOS by defaulting RE2 to use C++11 ("tr1/unordered_set" isn't provided by the clang C++ library
|
||||
anymore and it's 2021)
|
||||
- Link to GitHub issue tracker as CPAN RT is going away.
|
||||
|
||||
0.13 2015-01-18
|
||||
- Unbreak Windows build
|
||||
- Up minimum perl to 5.12
|
||||
|
||||
0.12 2015-01-17
|
||||
- Fix for `"" =~ {}` crashing
|
||||
- Fix for building on perl >= 5.20 (RT #95144, thanks Tony C. for the patch)
|
||||
- Fix build with -Werror=format-security (RT #96338)
|
||||
|
||||
0.11 2012-07-29
|
||||
- Support named capture groups
|
||||
- Support perl >= 5.17.1; add a nulled out op_comp to engine struct
|
||||
|
||||
0.10 2012-07-24
|
||||
- Add missing compat-rx.h file
|
||||
|
||||
0.09 2012-04-01 (Brought to you from the 2012 QA Hackathon in Paris)
|
||||
- Thread destruction fixes
|
||||
|
||||
0.08 2011-04-18
|
||||
- Add files I forgot to add
|
||||
|
||||
0.07_01 2011-04-16
|
||||
- Use cophh API
|
||||
- Support -strict mode
|
||||
|
||||
0.07 2011-04-11
|
||||
- RT #67192: Fix /s support
|
||||
- Attempt to compile with -O3 as RE2 does
|
||||
- Fix leak in possible_match_range
|
||||
- Fix compilation on gcc 4.6 (RE2 issue 35)
|
||||
|
||||
0.06 2011-04-02
|
||||
- RT #67153: Fix interpolation of RE2 into RE2
|
||||
(qr// stringification included the x flag which RE2 doesn't support)
|
||||
|
||||
0.05 2011-02-06
|
||||
- Allow setting of RE2's max_mem to control memory bound
|
||||
- Improve documentation
|
||||
|
||||
0.04 2011-01-29
|
||||
- Remove various UNIXisms from RE2, now builds under Win32/Strawberry
|
||||
(still needs gmake installed, which comes with Strawberry)
|
||||
|
||||
0.03 2011-01-23
|
||||
- Pass more options from MakeMaker to RE2, should now work on x86_64 again
|
||||
- Run RE2's own test suite as part of build if we can
|
||||
|
||||
0.02 2011-01-22
|
||||
- Use ExtUtils::CppGuess and try to find GNU make
|
||||
|
||||
0.01 2011-01-16
|
||||
- Fixes for //g, captures, generally many things
|
||||
|
||||
0.01_01 2010-07-25
|
||||
- Initial dev. version
|
|
@ -0,0 +1,143 @@
|
|||
Changes
|
||||
compat-cophh.h
|
||||
compat-rx.h
|
||||
lib/re/engine/RE2.pm
|
||||
Makefile.PL
|
||||
MANIFEST This list of files
|
||||
MANIFEST.SKIP
|
||||
ppport.h
|
||||
RE2.xs
|
||||
re2/.hgignore
|
||||
re2/AUTHORS
|
||||
re2/CONTRIBUTORS
|
||||
re2/libre2.symbols
|
||||
re2/libre2.symbols.darwin
|
||||
re2/LICENSE
|
||||
re2/Makefile
|
||||
re2/re2/bitstate.cc
|
||||
re2/re2/compile.cc
|
||||
re2/re2/dfa.cc
|
||||
re2/re2/filtered_re2.cc
|
||||
re2/re2/filtered_re2.h
|
||||
re2/re2/make_perl_groups.pl
|
||||
re2/re2/make_unicode_casefold.py
|
||||
re2/re2/make_unicode_groups.py
|
||||
re2/re2/Makefile
|
||||
re2/re2/mimics_pcre.cc
|
||||
re2/re2/nfa.cc
|
||||
re2/re2/onepass.cc
|
||||
re2/re2/parse.cc
|
||||
re2/re2/perl_groups.cc
|
||||
re2/re2/prefilter.cc
|
||||
re2/re2/prefilter.h
|
||||
re2/re2/prefilter_tree.cc
|
||||
re2/re2/prefilter_tree.h
|
||||
re2/re2/prog.cc
|
||||
re2/re2/prog.h
|
||||
re2/re2/re2.cc
|
||||
re2/re2/re2.h
|
||||
re2/re2/regexp.cc
|
||||
re2/re2/regexp.h
|
||||
re2/re2/set.cc
|
||||
re2/re2/set.h
|
||||
re2/re2/simplify.cc
|
||||
re2/re2/stringpiece.h
|
||||
re2/re2/testing/backtrack.cc
|
||||
re2/re2/testing/charclass_test.cc
|
||||
re2/re2/testing/compile_test.cc
|
||||
re2/re2/testing/dfa_test.cc
|
||||
re2/re2/testing/dump.cc
|
||||
re2/re2/testing/exhaustive1_test.cc
|
||||
re2/re2/testing/exhaustive2_test.cc
|
||||
re2/re2/testing/exhaustive3_test.cc
|
||||
re2/re2/testing/exhaustive_test.cc
|
||||
re2/re2/testing/exhaustive_tester.cc
|
||||
re2/re2/testing/exhaustive_tester.h
|
||||
re2/re2/testing/filtered_re2_test.cc
|
||||
re2/re2/testing/mimics_pcre_test.cc
|
||||
re2/re2/testing/null_walker.cc
|
||||
re2/re2/testing/parse_test.cc
|
||||
re2/re2/testing/possible_match_test.cc
|
||||
re2/re2/testing/random_test.cc
|
||||
re2/re2/testing/re2_arg_test.cc
|
||||
re2/re2/testing/re2_test.cc
|
||||
re2/re2/testing/regexp_benchmark.cc
|
||||
re2/re2/testing/regexp_generator.cc
|
||||
re2/re2/testing/regexp_generator.h
|
||||
re2/re2/testing/regexp_test.cc
|
||||
re2/re2/testing/required_prefix_test.cc
|
||||
re2/re2/testing/search_test.cc
|
||||
re2/re2/testing/set_test.cc
|
||||
re2/re2/testing/simplify_test.cc
|
||||
re2/re2/testing/string_generator.cc
|
||||
re2/re2/testing/string_generator.h
|
||||
re2/re2/testing/string_generator_test.cc
|
||||
re2/re2/testing/tester.cc
|
||||
re2/re2/testing/tester.h
|
||||
re2/re2/testing/unicode_test.py
|
||||
re2/re2/tostring.cc
|
||||
re2/re2/unicode.py
|
||||
re2/re2/unicode_casefold.cc
|
||||
re2/re2/unicode_casefold.h
|
||||
re2/re2/unicode_groups.cc
|
||||
re2/re2/unicode_groups.h
|
||||
re2/re2/variadic_function.h
|
||||
re2/re2/walker-inl.h
|
||||
re2/README
|
||||
re2/runtests
|
||||
re2/testinstall.cc
|
||||
re2/util/arena.cc
|
||||
re2/util/arena.h
|
||||
re2/util/atomicops.h
|
||||
re2/util/benchmark.cc
|
||||
re2/util/benchmark.h
|
||||
re2/util/flags.h
|
||||
re2/util/hash.cc
|
||||
re2/util/logging.h
|
||||
re2/util/mutex.h
|
||||
re2/util/pcre.cc
|
||||
re2/util/pcre.h
|
||||
re2/util/random.cc
|
||||
re2/util/random.h
|
||||
re2/util/rune.cc
|
||||
re2/util/sparse_array.h
|
||||
re2/util/sparse_array_test.cc
|
||||
re2/util/sparse_set.h
|
||||
re2/util/stringpiece.cc
|
||||
re2/util/stringprintf.cc
|
||||
re2/util/strutil.cc
|
||||
re2/util/test.cc
|
||||
re2/util/test.h
|
||||
re2/util/thread.cc
|
||||
re2/util/thread.h
|
||||
re2/util/utf.h
|
||||
re2/util/util.h
|
||||
re2/util/valgrind.cc
|
||||
re2/util/valgrind.h
|
||||
re2_xs.cc
|
||||
re2_xs.h
|
||||
README
|
||||
t/00.compile.t
|
||||
t/00.re2-tests.t
|
||||
t/01.basic.t
|
||||
t/02.chars.t
|
||||
t/03.modifiers.t
|
||||
t/04.multiline.t
|
||||
t/05.url.t
|
||||
t/06.matchrange.t
|
||||
t/07.utf8.t
|
||||
t/08.pos.t
|
||||
t/09.mem.t
|
||||
t/10.options.t
|
||||
t/ree-pcre/capture.t
|
||||
t/ree-pcre/import.t
|
||||
t/ree-pcre/match.t
|
||||
t/ree-pcre/qr.t
|
||||
t/ree-pcre/s.t
|
||||
t/ree-pcre/split-null.t
|
||||
t/ree-pcre/split.t
|
||||
t/ree-pcre/subexp.t
|
||||
t/ree-pcre/unimport.t
|
||||
TODO
|
||||
META.yml Module YAML meta-data (added by MakeMaker)
|
||||
META.json Module JSON meta-data (added by MakeMaker)
|
|
@ -0,0 +1,46 @@
|
|||
#!start included /Users/dgl/.perl5/lib/perl5/ExtUtils/MANIFEST.SKIP
|
||||
# Avoid version control files.
|
||||
\bRCS\b
|
||||
\bCVS\b
|
||||
\bSCCS\b
|
||||
,v$
|
||||
\B\.svn\b
|
||||
\B\.git\b
|
||||
\B\.gitignore\b
|
||||
\b_darcs\b
|
||||
|
||||
# Avoid Makemaker generated and utility files.
|
||||
\bMANIFEST\.bak
|
||||
^Makefile$
|
||||
\bblib/
|
||||
\bMakeMaker-\d
|
||||
\bpm_to_blib\.ts$
|
||||
\bpm_to_blib$
|
||||
\bblibdirs\.ts$ # 6.18 through 6.25 generated this
|
||||
|
||||
# Avoid Module::Build generated and utility files.
|
||||
\bBuild$
|
||||
\b_build/
|
||||
|
||||
# Avoid temp and backup files.
|
||||
~$
|
||||
\.old$
|
||||
\#$
|
||||
\b\.#
|
||||
\.bak$
|
||||
|
||||
# Avoid Devel::Cover files.
|
||||
\bcover_db\b
|
||||
#!end included /Users/dgl/.perl5/lib/perl5/ExtUtils/MANIFEST.SKIP
|
||||
|
||||
.*\.o$
|
||||
.*\.c$
|
||||
.*\.bs$
|
||||
.*\.gz$
|
||||
.*\.tar$
|
||||
|
||||
.*\.so$
|
||||
MYMETA.yml
|
||||
|
||||
^misc/
|
||||
^re2/obj/
|
|
@ -0,0 +1,43 @@
|
|||
{
|
||||
"abstract" : "RE2 regex engine",
|
||||
"author" : [
|
||||
"David Leadbeater <dgl@dgl.cx>"
|
||||
],
|
||||
"dynamic_config" : 1,
|
||||
"generated_by" : "ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010",
|
||||
"license" : [
|
||||
"perl_5"
|
||||
],
|
||||
"meta-spec" : {
|
||||
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
|
||||
"version" : 2
|
||||
},
|
||||
"name" : "re-engine-RE2",
|
||||
"no_index" : {
|
||||
"directory" : [
|
||||
"t",
|
||||
"inc"
|
||||
]
|
||||
},
|
||||
"prereqs" : {
|
||||
"build" : {
|
||||
"requires" : {
|
||||
"ExtUtils::MakeMaker" : "0"
|
||||
}
|
||||
},
|
||||
"configure" : {
|
||||
"requires" : {
|
||||
"ExtUtils::CppGuess" : "0",
|
||||
"Test::More" : "0.88"
|
||||
}
|
||||
}
|
||||
},
|
||||
"release_status" : "stable",
|
||||
"resources" : {
|
||||
"bugtracker" : {
|
||||
"web" : "https://github.com/dgl/re-engine-RE2/issues"
|
||||
}
|
||||
},
|
||||
"version" : "0.14",
|
||||
"x_serialization_backend" : "JSON::PP version 2.97001"
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
---
|
||||
abstract: 'RE2 regex engine'
|
||||
author:
|
||||
- 'David Leadbeater <dgl@dgl.cx>'
|
||||
build_requires:
|
||||
ExtUtils::MakeMaker: '0'
|
||||
configure_requires:
|
||||
ExtUtils::CppGuess: '0'
|
||||
Test::More: '0.88'
|
||||
dynamic_config: 1
|
||||
generated_by: 'ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 2.150010'
|
||||
license: perl
|
||||
meta-spec:
|
||||
url: http://module-build.sourceforge.net/META-spec-v1.4.html
|
||||
version: '1.4'
|
||||
name: re-engine-RE2
|
||||
no_index:
|
||||
directory:
|
||||
- t
|
||||
- inc
|
||||
resources:
|
||||
bugtracker: https://github.com/dgl/re-engine-RE2/issues
|
||||
version: '0.14'
|
||||
x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
|
|
@ -0,0 +1,132 @@
|
|||
use 5.012;
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use Config;
|
||||
use ExtUtils::MakeMaker;
|
||||
use ExtUtils::CppGuess;
|
||||
|
||||
# TODO: Optionally use system libre2, via ExtUtils::Liblist?
|
||||
|
||||
# Objects linked into the extension: the XS glue (RE2.o), the C++ wrapper
# (re2_xs.o) and the bundled RE2 static library.
my @objects = qw(RE2.o re2_xs.o re2/obj/libre2.a);

# ExtUtils::CppGuess contributes the MakeMaker options needed to build C++.
my $guess = ExtUtils::CppGuess->new;

# Arguments for WriteMakefile(). The CppGuess options come last so that they
# are merged into the same flat key/value list.
my %opt = (
  NAME               => 're::engine::RE2',
  AUTHOR             => 'David Leadbeater <dgl@dgl.cx>',
  VERSION_FROM       => 'lib/re/engine/RE2.pm',
  ABSTRACT_FROM      => 'lib/re/engine/RE2.pm',
  LICENSE            => 'perl',
  INC                => '-Ire2',
  PMLIBDIRS          => ["lib"],
  OBJECT             => join(" ", @objects),
  test               => {TESTS => 't/*.t t/ree-pcre/*.t'},
  CONFIGURE_REQUIRES => {
    "ExtUtils::CppGuess" => 0,
    "Test::More"         => 0.88,
  },
  $guess->makemaker_options
);

# META_MERGE is only passed when ExtUtils::MakeMaker is at least 6.46.
if(eval { ExtUtils::MakeMaker->VERSION(6.46) }) {
  $opt{META_MERGE} = {
    'meta-spec' => { version => 2 },
    resources => {
      # With meta-spec version 2 the "repository" resource has to be a hash
      # (type/url/web). A plain string is not valid there, and the META files
      # shipped with 0.14 carry no repository entry at all.
      repository => {
        type => 'git',
        url  => 'https://github.com/dgl/re-engine-RE2.git',
        web  => 'https://github.com/dgl/re-engine-RE2',
      },
      bugtracker => {
        web => 'https://github.com/dgl/re-engine-RE2/issues',
      },
    },
  };
}
|
||||
|
||||
# Unless the user supplied OPTIMIZE=... on the command line, pick optimisation
# settings for them. This is only attempted when the compiler is gcc.

# A CC=... argument overrides the compiler perl itself was configured with.
my @cc_args = map +(/^CC=(.*)/i), @ARGV;
my $cc = $cc_args[0] || $Config{cc};

my $optimize_given = grep { /^OPTIMIZE=/i } @ARGV;

if(!$optimize_given and my $gcc_version = gcc_version($cc)) {
  say "Compiling on gcc $gcc_version";

  # Start from perl's own flags and raise the optimisation level to -O3.
  # (This branch is only reached for gcc, so no further check is needed.)
  my $optimize = $Config{optimize};
  if($optimize =~ s/-O[s0-2]/-O3/) {
    say "Optimize level set to -O3";
  }

  # Attempt to work out if we have a gcc that is likely to support -flto.
  # This is probably a lot of work for a minimal gain, but it's worth a try.
  if($gcc_version >= 4.5) {
    # Each flag is kept only if the compiler accepts it on top of the flags
    # gathered so far. gcc 4.9 needs -ffat-lto-objects, otherwise it gets rid
    # of nearly everything in libre2.a.
    for my $flag (qw(-flto -ffat-lto-objects)) {
      my $candidate = "$optimize $flag";
      $optimize = $candidate
        if gcc_try(cc => $cc, %opt, OPTIMIZE => $candidate);
    }
  }

  say "OPTIMIZE is now: $optimize";
  $opt{OPTIMIZE} = $optimize;
}
|
||||
|
||||
# Choose the threading define handed to the RE2 build. pthread support is
# enabled only when %Config reports both usethreads and i_pthread as 'define'.
my $perl_threaded = ($Config{usethreads} // '') eq 'define';
my $have_pthread  = ($Config{i_pthread}  // '') eq 'define';

# Everything else gets -DNO_THREADS. For a threaded perl without pthread this
# allows compilation under Win32/Strawberry for now, but might cause weird
# crashes on thread destruction...
$opt{DEFINE} = ($perl_threaded && $have_pthread)
  ? "-DHAVE_PTHREAD -pthread"
  : "-DNO_THREADS";
|
||||
|
||||
# This is a bit hacky: the RE2 makefile needs GNU make, so for now we probe the
# usual command names for one. Ideally the RE2 makefile should be rewritten to
# not need GNU make.
# $MAKE is a package variable; MY::postamble interpolates it into its rules.
our $MAKE;
for my $candidate (qw(make gmake)) {
  next unless qx{$candidate --version 2>&1} =~ /GNU Make/i;
  $MAKE = $candidate;
  last;
}

die "RE2 currently needs GNU Make, please install gmake.\n" unless $MAKE;

WriteMakefile(%opt);
|
||||
|
||||
# Run "$cc -v" and pull the gcc version out of its output.
#
# Returns the leading "major.minor" part of the version (e.g. 4.9), or 0 when
# the output contains no "gcc version N.N" text.
sub gcc_version {
  my($cc) = @_;

  my $banner = qx{$cc -v 2>&1};

  # Only the first two components of the version are kept.
  return $1 if $banner =~ /gcc version (\d+\.\d+)/;
  return 0;
}
|
||||
|
||||
# Check whether the compiler accepts a set of flags by compiling /dev/null.
#
# Takes a flat key/value list; the keys used are:
#   cc       - compiler command to run
#   CCFLAGS  - base compiler flags (optional)
#   OPTIMIZE - optimisation flags under test (optional)
# Any other keys (the MakeMaker options are passed in as well) are ignored.
#
# Returns true when the compiler exits successfully, false otherwise.
#
# This is highly gcc and unix specific, but that's where I care about
# optimising this anyway.
sub gcc_try {
  my(%opts) = @_;

  # CCFLAGS is not guaranteed to be present in the options; default the
  # optional flags to an empty string so a missing key does not trigger an
  # "uninitialized value" warning while building the command line.
  my $ccflags  = $opts{CCFLAGS}  // '';
  my $optimize = $opts{OPTIMIZE} // '';

  system "$opts{cc} $ccflags $optimize -c -o /dev/null /dev/null >/dev/null 2>&1";
  return !$?;
}
|
||||
|
||||
# Returns extra Makefile text for ExtUtils::MakeMaker's postamble section.
# The fragment defines RE2_FLAGS (compiler and linker settings forwarded to the
# RE2 build), a rule that builds re2/obj/libre2.a by running $MAKE in the re2
# directory, and a "re2-tests" target that runs RE2's "static-test" target.
# $MAKE is interpolated by Perl when the fragment is generated; the make
# variables are written as \$(...) so that they are left for make to expand.
sub MY::postamble {
|
||||
return <<MAKE_FRAG;
|
||||
|
||||
RE2_FLAGS = CC="\$(CC)" CXXFLAGS="\$(CCFLAGS) \$(CCCDLFLAGS) \$(OPTIMIZE) \$(DEFINE) -DUSE_CXX0X" LDFLAGS="\$(OTHERLDFLAGS) \$(LDLOADLIBS)"
|
||||
|
||||
re2/obj/libre2.a: re2/Makefile
|
||||
$MAKE -C re2 obj/libre2.a \$(RE2_FLAGS)
|
||||
|
||||
re2-tests:
|
||||
$MAKE -C re2 static-test \$(RE2_FLAGS) LDFLAGS="\$(OTHERLDFLAGS) \$(LDLOADLIBS) -lm -lpthread"
|
||||
|
||||
MAKE_FRAG
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
#include "re2_xs.h"
|
||||
#include "ppport.h"
|
||||
|
||||
MODULE = re::engine::RE2 PACKAGE = re::engine::RE2
|
||||
PROTOTYPES: ENABLE
|
||||
|
||||
void
|
||||
ENGINE(...)
|
||||
PROTOTYPE:
|
||||
PPCODE:
|
||||
    /* Return the address of re2_engine as an integer (IV) in a mortal SV.
     * The Perl side stores this value in $^H{regcomp}. */
XPUSHs(sv_2mortal(newSViv(PTR2IV(&re2_engine))));
|
||||
|
||||
# Use a typemap for this maybe, especially if we add more methods like it!
|
||||
void
|
||||
possible_match_range(SV *self, STRLEN len = 10)
|
||||
PROTOTYPE:
|
||||
PPCODE:
|
||||
    /* Method on a compiled regexp. len defaults to 10 and is passed straight
     * through to RE2_possible_match_range. Returns two values on the stack. */
REGEXP* rx;
|
||||
SV *possible_min, *possible_max;
|
||||
|
||||
    /* Only accept a reference whose sv_reftype() name is exactly
     * "re::engine::RE2"; anything else croaks. */
if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
|
||||
croak("qr// reference to a re::engine::RE2 instance required");
|
||||
rx = SvRX(self);
|
||||
|
||||
    /* NOTE(review): presumably always assigns both output SVs; confirm in
     * re2_xs.cc, since they are pushed unconditionally below. */
RE2_possible_match_range(aTHX_ rx, len, &possible_min, &possible_max);
|
||||
|
||||
    /* Push both results as mortal SVs: min first, then max. */
mXPUSHs(possible_min);
|
||||
mXPUSHs(possible_max);
|
||||
|
||||
HV*
|
||||
named_captures(SV *self)
|
||||
PROTOTYPE:
|
||||
CODE:
|
||||
    /* Method on a compiled regexp: returns the HV built by
     * RE2_named_captures for it. Croaks unless self is a reference whose
     * sv_reftype() name is exactly "re::engine::RE2".
     * NOTE(review): who owns the returned HV depends on RE2_named_captures
     * and on the HV* typemap in use; confirm the hash is not leaked. */
REGEXP* rx;
|
||||
if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
|
||||
croak("qr// reference to a re::engine::RE2 instance required");
|
||||
rx = SvRX(self);
|
||||
RETVAL = RE2_named_captures(aTHX_ rx);
|
||||
OUTPUT:
|
||||
RETVAL
|
||||
|
||||
int
|
||||
number_of_capture_groups(SV *self)
|
||||
PROTOTYPE:
|
||||
CODE:
|
||||
    /* Method on a compiled regexp: returns the integer reported by
     * RE2_number_of_capture_groups for it. Croaks unless self is a reference
     * whose sv_reftype() name is exactly "re::engine::RE2". */
REGEXP* rx;
|
||||
if(!SvROK(self) || 0 != strcmp("re::engine::RE2", sv_reftype(SvRV(self), TRUE)))
|
||||
croak("qr// reference to a re::engine::RE2 instance required");
|
||||
rx = SvRX(self);
|
||||
RETVAL = RE2_number_of_capture_groups(aTHX_ rx);
|
||||
OUTPUT:
|
||||
RETVAL
|
|
@ -0,0 +1,145 @@
|
|||
NAME
|
||||
re::engine::RE2 - RE2 regex engine
|
||||
|
||||
SYNOPSIS
|
||||
use re::engine::RE2;
|
||||
|
||||
if ("Hello, world" =~ /Hello, (world)/) {
|
||||
print "Greetings, $1!";
|
||||
}
|
||||
|
||||
DESCRIPTION
|
||||
This module replaces perl's regex engine in a given lexical scope with
|
||||
RE2.
|
||||
|
||||
RE2 is a primarily DFA based regexp engine from Google that is very fast
|
||||
at matching large amounts of text. However it does not support look
|
||||
behind and some other Perl regular expression features. See
|
||||
http://code.google.com/p/re2 for more information.
|
||||
|
||||
Fallback to normal Perl regexp is implemented by this module. If RE2 is
|
||||
unable to compile a regexp it will use Perl instead, therefore features
|
||||
not implemented by RE2 don't suddenly stop working, they will just use
|
||||
Perl's regexp implementation.
|
||||
|
||||
METHODS
|
||||
To access extra functionality of RE2, methods can be called on a compiled
|
||||
regular expression (i.e. a "qr//").
|
||||
|
||||
* "possible_match_range([length = 10])"
|
||||
|
||||
Returns an array of two strings: where the expression will start
|
||||
matching and just after where it will finish matching. See RE2's
|
||||
documentation on PossibleMatchRange for further details.
|
||||
|
||||
Example:
|
||||
|
||||
my($min, $max) = qr/^(a|b)/->possible_match_range;
|
||||
is $min, 'a';
|
||||
is $max, 'c';
|
||||
|
||||
PERFORMANCE
|
||||
Performance is really the primary reason for using RE2, so here are some
|
||||
benchmarks. Like any benchmark take them with a pinch of salt.
|
||||
|
||||
Simple matching
|
||||
my $foo = "foo bar baz";
|
||||
$foo =~ /foo/;
|
||||
$foo =~ /foox/;
|
||||
|
||||
On this very simple match RE2 is actually slower:
|
||||
|
||||
Rate re2 re
|
||||
re2 674634/s -- -76%
|
||||
re 2765739/s 310% --
|
||||
|
||||
URL matching
|
||||
Matching "m{([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^
|
||||
@]+)}" against a several KB file:
|
||||
|
||||
Rate re re2
|
||||
re 35.2/s -- -99%
|
||||
re2 2511/s 7037% --
|
||||
|
||||
Many alternatives
|
||||
Matching a string against a regexp with 17,576 alternatives ("aaa ..
|
||||
zzz").
|
||||
|
||||
This uses trie matching on Perl (obviously RE2 does similar by default).
|
||||
|
||||
$ perl misc/altern.pl
|
||||
Rate re re2
|
||||
re 52631/s -- -91%
|
||||
re2 554938/s 954% --
|
||||
|
||||
NOTES
|
||||
* No support for "m//x"
|
||||
|
||||
The "/x" modifier is not supported. (There's no particular reason
|
||||
for this, just RE2 itself doesn't support it). Fallback to Perl
|
||||
regexp will happen automatically if "//x" is used.
|
||||
|
||||
* "re2/dfa.cc:447: DFA out of memory: prog size xxx mem yyy"
|
||||
|
||||
If you attempt to compile a really large regular expression you may
|
||||
get this error. RE2 has an internal limit on memory consumption for
|
||||
the DFA state tables. By default this is 8 MiB.
|
||||
|
||||
If you need to increase this size then use the max_mem parameter:
|
||||
|
||||
use re::engine::RE2 -max_mem => 8<<23; # 64MiB
|
||||
|
||||
* How do I tell if RE2 will be used?
|
||||
|
||||
See if your regexp is matching quickly or slowly ;).
|
||||
|
||||
Alternatively normal OO concepts apply and you may examine the
|
||||
object returned by "qr//":
|
||||
|
||||
use re::engine::RE2;
|
||||
|
||||
ok qr/foo/->isa("re::engine::RE2");
|
||||
|
||||
# Perl Regexp used instead
|
||||
ok not qr/(?<=foo)bar/->isa("re::engine::RE2");
|
||||
|
||||
BUGS
|
||||
Known issues:
|
||||
|
||||
* Unicode handling
|
||||
|
||||
Currently the Unicode handling of re::engine::RE2 does not fully
|
||||
match Perl's behaviour.
|
||||
|
||||
The UTF-8 flag of the regexp currently determines how the string is
|
||||
matched. This is obviously broken, so will be fixed at some point.
|
||||
|
||||
* Final newline matching differs to Perl
|
||||
|
||||
"\n" =~ /$/
|
||||
|
||||
The above is true in Perl, false in RE2. To work around the issue
|
||||
you can write "\n?\z" when you mean Perl's "$".
|
||||
|
||||
Please report bugs or provide patches at
|
||||
<https://github.com/dgl/re-engine-RE2>.
|
||||
|
||||
AUTHORS
|
||||
David Leadbeater <dgl[at]dgl[dot]cx>
|
||||
|
||||
COPYRIGHT
|
||||
Copyright 2010 David Leadbeater.
|
||||
|
||||
Based on re::engine::PCRE:
|
||||
|
||||
Copyright 2007 Ævar Arnfjörð Bjarmason.
|
||||
|
||||
The original version was copyright 2006 Audrey Tang <cpan@audreyt.org>
|
||||
and Yves Orton.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the same terms as Perl itself.
|
||||
|
||||
(However the bundled copy of RE2 has a different copyright owner and is
|
||||
under a BSD-like license, see re2/LICENSE.)
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
# -*- mode: org -*-
|
||||
|
||||
* Fix UTF-8 support
|
||||
This turns out to be harder than I was thinking. The first step is to compile
|
||||
two versions of the regexp, one for matching UTF-8 and one for matching
|
||||
Latin1 (maybe on demand).
|
||||
|
||||
RE2 won't accept \x{...} escapes that are greater than the current character
|
||||
set. I was hoping it would be possible to give a string containing these to
|
||||
RE2 then let RE2 realise part of it won't match (e.g. (?:foo|\x{1234}) will
|
||||
still match foo, even if the input string isn't UTF-8).
|
||||
|
||||
(I'm only talking about \x{...}; this is the only case I have to
|
||||
care about, \p{...} *are* accepted by RE2 regardless. Due to Perl's
|
||||
behaviour we can't have raw UTF-8 in the string if the UTF-8 flag
|
||||
isn't on.)
|
||||
|
||||
The approach for now will probably be to replace \x{nnn} in strings (where
|
||||
nnn>0xFF) with something that won't match (maybe [^\x00-\xff]), but allows
|
||||
the other branches to match.
|
||||
** Think about supporting perl 5.14's unicode regexp flags
|
||||
At least at the top level, implementing within RE2 would be silly.
|
||||
|
||||
RE2 doesn't have all the behaviours perl does (i.e. /a is implied
|
||||
for \d, etc.). Might just be a case of documenting what RE2 does,
|
||||
once UTF-8 is working to some extent. An alternative could be to
|
||||
make things explicit (e.g. you need to say "no feature
|
||||
'unicode_strings'" if you happen to have enabled them to use RE2).
|
||||
* Switch to dzil
|
||||
* Support more options
|
||||
** never_nl could be useful for cpangrep optimisations
|
||||
* Support RE2::Set functionality
|
||||
i.e. a Regexp::RE2::Set class that can have RE2 regexps added into it
|
||||
then either a match method or maybe overload ~~?
|
||||
* Improve tests
|
||||
** See if t/re/re_tests from Perl can be used.
|
||||
** Improve performance comparisons
|
||||
See maybe https://github.com/axiak/pyre2/blob/master/tests/performance.py
|
||||
* Support /x (probably needs RE2 changes to do properly)
|
||||
* Both Perl and RE2 store the stringification of the regexp, can we avoid this?
|
|
@ -0,0 +1,10 @@
|
|||
/* Compatibility for bits of the cophh API which was added in 5.13.7.
|
||||
* This uses refcounted_he_* functions that are not part of the public perl
|
||||
* API, therefore won't work on platforms with strict linkers (Windows, AIX).
|
||||
*/
|
||||
#if PERL_VERSION < 13 || (PERL_VERSION == 13 && PERL_SUBVERSION < 7)
|
||||
|
||||
/* Fallback definition of cophh_fetch_pvs for perls older than 5.13.7.
 * key must be a string literal (or char array): its length is computed as
 * sizeof(key) - 1. The expansion uses aTHX_, so a perl context must be
 * available at the call site. */
#define cophh_fetch_pvs(cophh, key, flags) \
|
||||
Perl_refcounted_he_fetch(aTHX_ cophh, NULL, key, sizeof(key) - 1, 0, flags)
|
||||
|
||||
#endif
|
|
@ -0,0 +1,7 @@
|
|||
/* Compatibility for RX_* macros added around 5.10.1. */
|
||||
|
||||
/* When perl does not provide RX_WRAPPED, define it and RX_WRAPLEN as direct
 * accesses to the wrapped / wraplen members of the regexp structure. */
#ifndef RX_WRAPPED
|
||||
#define RX_WRAPPED(prog) ((prog)->wrapped)
|
||||
#define RX_WRAPLEN(prog) ((prog)->wraplen)
|
||||
#endif
|
||||
|
|
@ -0,0 +1,270 @@
|
|||
package re::engine::RE2;
|
||||
use 5.012;
|
||||
|
||||
# The version is assigned inside BEGIN so that it is already set when the
# XSLoader BEGIN block further down runs at compile time.
BEGIN {
|
||||
$re::engine::RE2::VERSION = "0.14";
|
||||
}
|
||||
|
||||
use XSLoader ();
|
||||
|
||||
# All engines should subclass the core Regexp package
|
||||
our @ISA = 'Regexp';
|
||||
|
||||
# Load the XS code at compile time, passing the module version to XSLoader.
# import() and unimport() use the bareword ENGINE, which the XS code defines.
BEGIN
|
||||
{
|
||||
XSLoader::load __PACKAGE__, $re::engine::RE2::VERSION;
|
||||
}
|
||||
|
||||
# Called for "use re::engine::RE2 ...". Installs RE2 as the regex engine for
# the enclosing lexical scope by storing ENGINE in the hints hash (%^H), and
# records any options given as key/value pairs.
#
# Recognised options: -max_mem, -strict, -longest_match and -never_nl. Each
# one present in the argument list is copied into %^H under a key made of the
# package name plus the suffix listed in the table below.
sub import
{
    my($class, %args) = @_;

    $^H{regcomp} = ENGINE;

    # [ option as passed by the caller => suffix of the %^H key ]
    my @known_options = (
        [ "-max_mem"       => "::max-mem"       ],
        [ "-strict"        => "::strict"        ],
        [ "-longest_match" => "::longest-match" ],
        [ "-never_nl"      => "::never-nl"      ],
    );

    for my $entry (@known_options) {
        my($option, $suffix) = @$entry;
        next unless exists $args{$option};
        $^H{__PACKAGE__ . $suffix} = $args{$option};
    }
}
|
||||
|
||||
# Called for "no re::engine::RE2". Removes the regcomp hint from %^H, but
# only when it currently holds this module's ENGINE value, so that a hint set
# by a different engine is left in place.
sub unimport
{
    if ($^H{regcomp} == ENGINE) {
        delete $^H{regcomp};
    }
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=encoding utf8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
re::engine::RE2 - RE2 regex engine
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use re::engine::RE2;
|
||||
|
||||
if ("Hello, world" =~ /Hello, (world)/) {
|
||||
print "Greetings, $1!";
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module replaces perl's regex engine in a given lexical scope with RE2.
|
||||
|
||||
RE2 is a primarily DFA based regexp engine from Google that is very fast at
|
||||
matching large amounts of text. However it does not support look behind and
|
||||
some other Perl regular expression features. See
|
||||
L<RE2's website|http://code.google.com/p/re2> for more information.
|
||||
|
||||
Fallback to normal Perl regexp is implemented by this module. If RE2 is unable
|
||||
to compile a regexp it will use Perl instead, therefore features not
|
||||
implemented by RE2 don't suddenly stop working, they will just use Perl's
|
||||
regexp implementation.
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
To access extra functionality of RE2, methods can be called on a compiled
|
||||
regular expression (i.e. a C<qr//>).
|
||||
|
||||
=over 4
|
||||
|
||||
=item * C<possible_match_range([length = 10])>
|
||||
|
||||
Returns an array of two strings: where the expression will start matching and
|
||||
just after where it will finish matching. See RE2's documentation on
|
||||
PossibleMatchRange for further details.
|
||||
|
||||
Example:
|
||||
|
||||
my($min, $max) = qr/^(a|b)/->possible_match_range;
|
||||
is $min, 'a';
|
||||
is $max, 'c';
|
||||
|
||||
=item * C<named_captures()>
|
||||
|
||||
Returns a hash reference mapping each named capture group to its capture index.
|
||||
|
||||
Example:
|
||||
|
||||
my $named_captures = qr/(?P<a>\w+) (?P<d>\w+)/->named_captures;
|
||||
is $named_captures->{a}, 1;
|
||||
is $named_captures->{d}, 2;
|
||||
|
||||
=item * C<number_of_capture_groups()>
|
||||
|
||||
Returns the number of capture groups in the regular expression.
|
||||
|
||||
Example:
|
||||
|
||||
my $captures = qr/(Hello), (world)/->number_of_capture_groups;
|
||||
is $captures, 2;
|
||||
|
||||
=back
|
||||
|
||||
=head1 PRAGMA OPTIONS
|
||||
|
||||
Various options can be set by providing options to the C<use> line. These will
|
||||
be pragma scoped.
|
||||
|
||||
=over 4
|
||||
|
||||
=item * C<< -max_mem => 1<<24 >>
|
||||
|
||||
Configure RE2's memory limit.
|
||||
|
||||
=item * C<< -strict => 1 >>
|
||||
|
||||
Be strict, i.e. don't allow regexps that are not supported by RE2.
|
||||
|
||||
=item * C<< -longest_match => 1 >>
|
||||
|
||||
Match on the longest match in alternations. For example with this option set
|
||||
matching C<"abc"> against C<(a|abc)> will match C<"abc">, without depending on
|
||||
order.
|
||||
|
||||
=item * C<< -never_nl => 1 >>
|
||||
|
||||
Never match a newline (C<"\n">) even if the provided regexp contains it.
|
||||
|
||||
=back
|
||||
|
||||
=head1 PERFORMANCE
|
||||
|
||||
Performance is really the primary reason for using RE2, so here are some
|
||||
benchmarks. Like any benchmark take them with a pinch of salt.
|
||||
|
||||
=head2 Simple matching
|
||||
|
||||
my $foo = "foo bar baz";
|
||||
$foo =~ /foo/;
|
||||
$foo =~ /foox/;
|
||||
|
||||
On this very simple match RE2 is actually slower:
|
||||
|
||||
Rate re2 re
|
||||
re2 674634/s -- -76%
|
||||
re 2765739/s 310% --
|
||||
|
||||
=head2 URL matching
|
||||
|
||||
Matching C<m{([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^
|
||||
@]+)}> against a several KB file:
|
||||
|
||||
Rate re re2
|
||||
re 35.2/s -- -99%
|
||||
re2 2511/s 7037% --
|
||||
|
||||
=head2 Many alternatives
|
||||
|
||||
Matching a string against a regexp with 17,576 alternatives (C<aaa .. zzz>).
|
||||
|
||||
This uses trie matching on Perl (obviously RE2 does similar by default).
|
||||
|
||||
$ perl misc/altern.pl
|
||||
Rate re re2
|
||||
re 52631/s -- -91%
|
||||
re2 554938/s 954% --
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
=over 4
|
||||
|
||||
=item * No support for C<m//x>
|
||||
|
||||
The C</x> modifier is not supported. (There's no particular reason for this,
|
||||
just RE2 itself doesn't support it). Fallback to Perl regexp will happen
|
||||
automatically if C<//x> is used.
|
||||
|
||||
=item * "re2/dfa.cc:447: DFA out of memory: prog size xxx mem yyy"
|
||||
|
||||
If you attempt to compile a really large regular expression you may get this
|
||||
error. RE2 has an internal limit on memory consumption for the DFA state
|
||||
tables. By default this is 8 MiB.
|
||||
|
||||
If you need to increase this size then use the max_mem parameter:
|
||||
|
||||
use re::engine::RE2 -max_mem => 8<<23; # 64MiB
|
||||
|
||||
=item * How do I tell if RE2 will be used?
|
||||
|
||||
See if your regexp is matching quickly or slowly ;).
|
||||
|
||||
Alternatively normal OO concepts apply and you may examine the object returned
|
||||
by C<qr//>:
|
||||
|
||||
use re::engine::RE2;
|
||||
|
||||
ok qr/foo/->isa("re::engine::RE2");
|
||||
|
||||
# Perl Regexp used instead
|
||||
ok not qr/(?<=foo)bar/->isa("re::engine::RE2");
|
||||
|
||||
If you wish to force RE2, use the C<-strict> option.
|
||||
|
||||
=back
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Known issues:
|
||||
|
||||
=over 4
|
||||
|
||||
=item * Unicode handling
|
||||
|
||||
Currently the Unicode handling of re::engine::RE2 does not fully match Perl's
|
||||
behaviour.
|
||||
|
||||
The UTF-8 flag of the regexp currently determines how the string is matched.
|
||||
This is obviously broken, so will be fixed at some point.
|
||||
|
||||
=item * Final newline matching differs to Perl
|
||||
|
||||
"\n" =~ /$/
|
||||
|
||||
The above is true in Perl, false in RE2. To work around the issue you can write
|
||||
C<\n?\z> when you mean Perl's C<$>.
|
||||
|
||||
=back
|
||||
|
||||
Please report bugs or provide patches at <https://github.com/dgl/re-engine-RE2>.
|
||||
|
||||
=head1 AUTHORS
|
||||
|
||||
David Leadbeater E<lt>dgl[at]dgl[dot]cxE<gt>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 2010 David Leadbeater.
|
||||
|
||||
Based on L<re::engine::PCRE>:
|
||||
|
||||
Copyright 2007 E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason.
|
||||
|
||||
The original version was copyright 2006 Audrey Tang
|
||||
E<lt>cpan@audreyt.orgE<gt> and Yves Orton.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the same terms as Perl itself.
|
||||
|
||||
(However the bundled copy of RE2 has a different copyright owner and is under a
|
||||
BSD-like license, see F<re2/LICENSE>.)
|
||||
|
||||
=cut
|
|
@ -0,0 +1,7 @@
|
|||
syntax:glob
|
||||
*.pyc
|
||||
*.orig
|
||||
core
|
||||
|
||||
syntax:regexp
|
||||
^obj/
|
|
@ -0,0 +1,12 @@
|
|||
# This is the official list of RE2 authors for copyright purposes.
|
||||
# This file is distinct from the CONTRIBUTORS files.
|
||||
# See the latter for an explanation.
|
||||
|
||||
# Names should be added to this file as
|
||||
# Name or Organization <email address>
|
||||
# The email address is not required for organizations.
|
||||
|
||||
# Please keep the list sorted.
|
||||
|
||||
Google Inc.
|
||||
Stefano Rivera <stefano.rivera@gmail.com>
|
|
@ -0,0 +1,33 @@
|
|||
# This is the official list of people who can contribute
|
||||
# (and typically have contributed) code to the RE2 repository.
|
||||
# The AUTHORS file lists the copyright holders; this file
|
||||
# lists people. For example, Google employees are listed here
|
||||
# but not in AUTHORS, because Google holds the copyright.
|
||||
#
|
||||
# The submission process automatically checks to make sure
|
||||
# that people submitting code are listed in this file (by email address).
|
||||
#
|
||||
# Names should be added to this file only after verifying that
|
||||
# the individual or the individual's organization has agreed to
|
||||
# the appropriate Contributor License Agreement, found here:
|
||||
#
|
||||
# http://code.google.com/legal/individual-cla-v1.0.html
|
||||
# http://code.google.com/legal/corporate-cla-v1.0.html
|
||||
#
|
||||
# The agreement for individuals can be filled out on the web.
|
||||
#
|
||||
# When adding J Random Contributor's name to this file,
|
||||
# either J's name or J's organization's name should be
|
||||
# added to the AUTHORS file, depending on whether the
|
||||
# individual or corporate CLA was used.
|
||||
|
||||
# Names should be added to this file like so:
|
||||
# Name <email address>
|
||||
|
||||
# Please keep the list sorted.
|
||||
|
||||
Rob Pike <r@google.com>
|
||||
Russ Cox <rsc@swtch.com>
|
||||
Sanjay Ghemawat <sanjay@google.com>
|
||||
Stefano Rivera <stefano.rivera@gmail.com>
|
||||
Srinivasan Venkatachary <vsri@google.com>
|
|
@ -0,0 +1,27 @@
|
|||
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above
|
||||
// copyright notice, this list of conditions and the following disclaimer
|
||||
// in the documentation and/or other materials provided with the
|
||||
// distribution.
|
||||
// * Neither the name of Google Inc. nor the names of its
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,287 @@
|
|||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
all: obj/libre2.a obj/so/libre2.so
|
||||
|
||||
# to build against PCRE for testing or benchmarking,
|
||||
# uncomment the next two lines
|
||||
# CCPCRE=-I/usr/local/include -DUSEPCRE
|
||||
# LDPCRE=-L/usr/local/lib -lpcre
|
||||
|
||||
#CC=g++
|
||||
#CXXFLAGS=-Wall -O3 -g -pthread # can override
|
||||
RE2_CXXFLAGS=-Wno-sign-compare -c -I. $(CCPCRE) # required
|
||||
#LDFLAGS=-pthread
|
||||
AR=ar
|
||||
ARFLAGS=rsc
|
||||
NM=nm
|
||||
NMFLAGS=-p
|
||||
|
||||
# Variables mandated by GNU, the arbiter of all good taste on the internet.
|
||||
# http://www.gnu.org/prep/standards/standards.html
|
||||
prefix=/usr/local
|
||||
exec_prefix=$(prefix)
|
||||
bindir=$(exec_prefix)/bin
|
||||
includedir=$(prefix)/include
|
||||
libdir=$(exec_prefix)/lib
|
||||
INSTALL=install
|
||||
INSTALL_PROGRAM=$(INSTALL)
|
||||
INSTALL_DATA=$(INSTALL) -m 644
|
||||
|
||||
# ABI version
|
||||
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
|
||||
SONAME=0
|
||||
|
||||
# To rebuild the Tables generated by Perl and Python scripts (requires Internet
|
||||
# access for Unicode data), uncomment the following line:
|
||||
# REBUILD_TABLES=1
|
||||
|
||||
#ifeq ($(shell uname),Darwin)
|
||||
#MAKE_SHARED_LIBRARY=g++ -dynamiclib $(LDFLAGS) -exported_symbols_list libre2.symbols.darwin
|
||||
#else
|
||||
#MAKE_SHARED_LIBRARY=g++ -shared -Wl,-soname,libre2.so.0,--version-script=libre2.symbols $(LDFLAGS)
|
||||
#endif
|
||||
|
||||
INSTALL_HFILES=\
|
||||
re2/re2.h\
|
||||
re2/set.h\
|
||||
re2/stringpiece.h\
|
||||
re2/variadic_function.h\
|
||||
|
||||
HFILES=\
|
||||
util/arena.h\
|
||||
util/atomicops.h\
|
||||
util/benchmark.h\
|
||||
util/flags.h\
|
||||
util/logging.h\
|
||||
util/mutex.h\
|
||||
util/pcre.h\
|
||||
util/random.h\
|
||||
util/sparse_array.h\
|
||||
util/sparse_set.h\
|
||||
util/test.h\
|
||||
util/utf.h\
|
||||
util/util.h\
|
||||
util/valgrind.h\
|
||||
re2/filtered_re2.h\
|
||||
re2/prefilter.h\
|
||||
re2/prefilter_tree.h\
|
||||
re2/prog.h\
|
||||
re2/re2.h\
|
||||
re2/regexp.h\
|
||||
re2/set.h\
|
||||
re2/stringpiece.h\
|
||||
re2/testing/exhaustive_tester.h\
|
||||
re2/testing/regexp_generator.h\
|
||||
re2/testing/string_generator.h\
|
||||
re2/testing/tester.h\
|
||||
re2/unicode_casefold.h\
|
||||
re2/unicode_groups.h\
|
||||
re2/variadic_function.h\
|
||||
re2/walker-inl.h\
|
||||
|
||||
OFILES=\
|
||||
obj/util/arena.o\
|
||||
obj/util/hash.o\
|
||||
obj/util/rune.o\
|
||||
obj/util/stringpiece.o\
|
||||
obj/util/stringprintf.o\
|
||||
obj/util/strutil.o\
|
||||
obj/util/valgrind.o\
|
||||
obj/re2/bitstate.o\
|
||||
obj/re2/compile.o\
|
||||
obj/re2/dfa.o\
|
||||
obj/re2/filtered_re2.o\
|
||||
obj/re2/mimics_pcre.o\
|
||||
obj/re2/nfa.o\
|
||||
obj/re2/onepass.o\
|
||||
obj/re2/parse.o\
|
||||
obj/re2/perl_groups.o\
|
||||
obj/re2/prefilter.o\
|
||||
obj/re2/prefilter_tree.o\
|
||||
obj/re2/prog.o\
|
||||
obj/re2/re2.o\
|
||||
obj/re2/regexp.o\
|
||||
obj/re2/set.o\
|
||||
obj/re2/simplify.o\
|
||||
obj/re2/tostring.o\
|
||||
obj/re2/unicode_casefold.o\
|
||||
obj/re2/unicode_groups.o\
|
||||
|
||||
TESTOFILES=\
|
||||
obj/util/pcre.o\
|
||||
obj/util/random.o\
|
||||
obj/util/thread.o\
|
||||
obj/re2/testing/backtrack.o\
|
||||
obj/re2/testing/dump.o\
|
||||
obj/re2/testing/exhaustive_tester.o\
|
||||
obj/re2/testing/null_walker.o\
|
||||
obj/re2/testing/regexp_generator.o\
|
||||
obj/re2/testing/string_generator.o\
|
||||
obj/re2/testing/tester.o\
|
||||
|
||||
TESTS=\
|
||||
obj/test/charclass_test\
|
||||
obj/test/compile_test\
|
||||
obj/test/filtered_re2_test\
|
||||
obj/test/mimics_pcre_test\
|
||||
obj/test/parse_test\
|
||||
obj/test/possible_match_test\
|
||||
obj/test/re2_test\
|
||||
obj/test/re2_arg_test\
|
||||
obj/test/regexp_test\
|
||||
obj/test/required_prefix_test\
|
||||
obj/test/search_test\
|
||||
obj/test/set_test\
|
||||
obj/test/simplify_test\
|
||||
obj/test/string_generator_test\
|
||||
|
||||
BIGTESTS=\
|
||||
obj/test/dfa_test\
|
||||
obj/test/exhaustive1_test\
|
||||
obj/test/exhaustive2_test\
|
||||
obj/test/exhaustive3_test\
|
||||
obj/test/exhaustive_test\
|
||||
obj/test/random_test\
|
||||
|
||||
SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES))
|
||||
STESTOFILES=$(patsubst obj/%,obj/so/%,$(TESTOFILES))
|
||||
STESTS=$(patsubst obj/%,obj/so/%,$(TESTS))
|
||||
SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS))
|
||||
|
||||
DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES))
|
||||
DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
|
||||
DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
|
||||
DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
|
||||
|
||||
obj:
|
||||
mkdir $@
|
||||
|
||||
obj/re2: obj
|
||||
cd obj && mkdir re2 || echo Okay
|
||||
|
||||
obj/util: obj
|
||||
cd obj && mkdir util || echo Okay
|
||||
|
||||
obj/test: obj
|
||||
cd obj && mkdir test || echo Okay
|
||||
|
||||
obj/re2/testing: obj/re2
|
||||
cd obj/re2 && mkdir testing || echo Okay
|
||||
|
||||
obj/%.o: obj/re2 obj/re2/testing obj/util %.cc $(HFILES)
|
||||
$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
|
||||
|
||||
obj/dbg/%.o: obj/dbg %.cc $(HFILES)
|
||||
$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc
|
||||
|
||||
obj/so/%.o: obj/so %.cc $(HFILES)
|
||||
$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
|
||||
|
||||
obj/%.o: obj %.c $(HFILES)
|
||||
$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.c
|
||||
|
||||
obj/dbg/%.o: obj/dbg %.c $(HFILES)
|
||||
$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) $*.c
|
||||
|
||||
obj/so/%.o: obj/so %.c $(HFILES)
|
||||
$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.c
|
||||
|
||||
obj/libre2.a: $(OFILES)
|
||||
$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
|
||||
|
||||
obj/dbg/libre2.a: obj/dbg $(DOFILES)
|
||||
$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
|
||||
|
||||
obj/so/libre2.so: obj/so $(SOFILES)
|
||||
$(MAKE_SHARED_LIBRARY) -o $@.0 $(SOFILES)
|
||||
ln -sf libre2.so.0 $@
|
||||
|
||||
obj/test/%: obj/test obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
|
||||
$(CC) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
|
||||
|
||||
obj/dbg/test/%: obj/dbg/test obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
|
||||
$(CC) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
|
||||
|
||||
obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
|
||||
$(CC) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
|
||||
|
||||
obj/test/regexp_benchmark: obj/test obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
|
||||
$(CC) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
|
||||
|
||||
ifdef REBUILD_TABLES
|
||||
re2/perl_groups.cc: re2/make_perl_groups.pl
|
||||
perl $< > $@
|
||||
|
||||
re2/unicode_%.cc: re2/make_unicode_%.py
|
||||
python $< > $@
|
||||
endif
|
||||
|
||||
distclean: clean
|
||||
rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
|
||||
|
||||
clean:
|
||||
rm -rf obj
|
||||
rm -f re2/*.pyc
|
||||
|
||||
testofiles: $(TESTOFILES)
|
||||
|
||||
test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test
|
||||
|
||||
debug-test: $(DTESTS)
|
||||
@echo
|
||||
@echo Running debug binary tests.
|
||||
@echo
|
||||
@./runtests $(DTESTS)
|
||||
|
||||
static-test: $(TESTS)
|
||||
@echo
|
||||
@echo Running static binary tests.
|
||||
@echo
|
||||
@./runtests $(TESTS)
|
||||
|
||||
shared-test: $(STESTS)
|
||||
@echo
|
||||
@echo Running dynamic binary tests.
|
||||
@echo
|
||||
@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS)
|
||||
|
||||
debug-bigtest: $(DTESTS) $(DBIGTESTS)
|
||||
@./runtests $(DTESTS) $(DBIGTESTS)
|
||||
|
||||
static-bigtest: $(TESTS) $(BIGTESTS)
|
||||
@./runtests $(TESTS) $(BIGTESTS)
|
||||
|
||||
shared-bigtest: $(STESTS) $(SBIGTESTS)
|
||||
@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS) $(SBIGTESTS)
|
||||
|
||||
benchmark: obj/test/regexp_benchmark
|
||||
|
||||
# Install the public headers, the static library and the shared library.
# $(DESTDIR) is a staging root: it is prepended to every destination path
# and never to a source path.
# The shared library is installed under its fully versioned name, with the
# SONAME link and the unversioned development link pointing at it.
install: obj/libre2.a obj/so/libre2.so
	mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)
	$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
	$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
	$(INSTALL) obj/so/libre2.so $(DESTDIR)$(libdir)/libre2.so.$(SONAME).0.0
	ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so.$(SONAME)
	ln -sf libre2.so.$(SONAME).0.0 $(DESTDIR)$(libdir)/libre2.so
|
||||
|
||||
testinstall:
|
||||
@mkdir -p obj
|
||||
cp testinstall.cc obj
|
||||
(cd obj && g++ -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall)
|
||||
LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall
|
||||
|
||||
benchlog: obj/test/regexp_benchmark
|
||||
(echo '==BENCHMARK==' `hostname` `date`; \
|
||||
(uname -a; g++ --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
|
||||
echo; \
|
||||
./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
|
||||
|
||||
# Keep gmake from deleting intermediate files it creates.
|
||||
# This makes repeated builds faster and preserves debug info on OS X.
|
||||
|
||||
.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
|
||||
obj/dbg/libre2.a obj/so/libre2.a \
|
||||
obj/test/% obj/so/test/% obj/dbg/test/%
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
This is the source code repository for RE2, a regular expression library.
|
||||
|
||||
For documentation about how to install and use RE2,
|
||||
visit http://code.google.com/p/re2/.
|
||||
|
||||
The short version is:
|
||||
|
||||
make
|
||||
make test
|
||||
make install
|
||||
make testinstall
|
||||
|
||||
Unless otherwise noted, the RE2 source files are distributed
|
||||
under the BSD-style license found in the LICENSE file.
|
||||
|
||||
RE2's native language is C++.
|
||||
An Inferno wrapper is at http://code.google.com/p/inferno-re2/.
|
||||
A Python wrapper is at http://github.com/facebook/pyre2/.
|
||||
A Ruby wrapper is at http://github.com/axic/rre2/.
|
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
global:
|
||||
# re2::RE2*
|
||||
_ZN3re23RE2*;
|
||||
_ZNK3re23RE2*;
|
||||
# re2::StringPiece*
|
||||
_ZN3re211StringPiece*;
|
||||
_ZNK3re211StringPiece*;
|
||||
# operator==(re2::StringPiece const&, re2::StringPiece const&)
|
||||
_ZeqRKN3re211StringPieceES2_;
|
||||
# operator<<(std::ostream&, re2::StringPiece const&)
|
||||
_ZlsRSoRKN3re211StringPieceE;
|
||||
local:
|
||||
*;
|
||||
};
|
|
@ -0,0 +1,11 @@
|
|||
# Linker doesn't like these unmangled:
|
||||
# re2::RE2*
|
||||
__ZN3re23RE2*
|
||||
__ZNK3re23RE2*
|
||||
# re2::StringPiece*
|
||||
__ZN3re211StringPiece*
|
||||
__ZNK3re211StringPiece*
|
||||
# operator==(re2::StringPiece const&, re2::StringPiece const&)
|
||||
__ZeqRKN3re211StringPieceES2_
|
||||
# operator<<(std::ostream&, re2::StringPiece const&)
|
||||
__ZlsRSoRKN3re211StringPieceE
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,378 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
|
||||
|
||||
// Prog::SearchBitState is a regular expression search with submatch
|
||||
// tracking for small regular expressions and texts. Like
|
||||
// testing/backtrack.cc, it allocates a bit vector with (length of
|
||||
// text) * (length of prog) bits, to make sure it never explores the
|
||||
// same (character position, instruction) state multiple times. This
|
||||
// limits the search to run in time linear in the length of the text.
|
||||
//
|
||||
// Unlike testing/backtrack.cc, SearchBitState is not recursive
|
||||
// on the text.
|
||||
//
|
||||
// SearchBitState is a fast replacement for the NFA code on small
|
||||
// regexps and texts when SearchOnePass cannot be used.
|
||||
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One entry on BitState's explicit backtracking stack.
struct Job {
  int id;         // instruction id, as passed to Prog::inst()
  int arg;        // 0 for a first visit; nonzero when resuming an earlier one
  const char* p;  // text position (for a resumed capture: the saved register)
};
|
||||
|
||||
class BitState {
|
||||
public:
|
||||
explicit BitState(Prog* prog);
|
||||
~BitState();
|
||||
|
||||
// The usual Search prototype.
|
||||
// Can only call Search once per BitState.
|
||||
bool Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch);
|
||||
|
||||
private:
|
||||
inline bool ShouldVisit(int id, const char* p);
|
||||
void Push(int id, const char* p, int arg);
|
||||
bool GrowStack();
|
||||
bool TrySearch(int id, const char* p);
|
||||
|
||||
// Search parameters
|
||||
Prog* prog_; // program being run
|
||||
StringPiece text_; // text being searched
|
||||
StringPiece context_; // greater context of text being searched
|
||||
bool anchored_; // whether search is anchored at text.begin()
|
||||
bool longest_; // whether search wants leftmost-longest match
|
||||
bool endmatch_; // whether match must end at text.end()
|
||||
StringPiece *submatch_; // submatches to fill in
|
||||
int nsubmatch_; // # of submatches to fill in
|
||||
|
||||
// Search state
|
||||
const char** cap_; // capture registers
|
||||
int ncap_;
|
||||
|
||||
static const int VisitedBits = 32;
|
||||
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
||||
int nvisited_; // # of words in bitmap
|
||||
|
||||
Job *job_; // stack of text positions to explore
|
||||
int njob_;
|
||||
int maxjob_;
|
||||
};
|
||||
|
||||
// Binds the matcher to prog and leaves everything else idle: all flags off,
// no submatch request, and no scratch arrays (Search allocates those).
BitState::BitState(Prog* prog)
    : prog_(prog),
      anchored_(false), longest_(false), endmatch_(false),
      submatch_(NULL), nsubmatch_(0),
      cap_(NULL), ncap_(0),
      visited_(NULL), nvisited_(0),
      job_(NULL), njob_(0), maxjob_(0) {}
|
||||
|
||||
// Releases the scratch arrays.  Any of them may still be NULL if Search
// was never called; delete[] on a null pointer does nothing.
BitState::~BitState() {
  delete[] cap_;
  delete[] job_;
  delete[] visited_;
}
|
||||
|
||||
// Should the search visit the pair ip, p?
|
||||
// If so, remember that it was visited so that the next time,
|
||||
// we don't repeat the visit.
|
||||
bool BitState::ShouldVisit(int id, const char* p) {
|
||||
uint n = id * (text_.size() + 1) + (p - text_.begin());
|
||||
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
|
||||
return false;
|
||||
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Grow the stack.
|
||||
bool BitState::GrowStack() {
|
||||
// VLOG(0) << "Reallocate.";
|
||||
maxjob_ *= 2;
|
||||
Job* newjob = new Job[maxjob_];
|
||||
memmove(newjob, job_, njob_*sizeof job_[0]);
|
||||
delete[] job_;
|
||||
job_ = newjob;
|
||||
if (njob_ >= maxjob_) {
|
||||
LOG(DFATAL) << "Job stack overflow.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
|
||||
void BitState::Push(int id, const char* p, int arg) {
|
||||
if (njob_ >= maxjob_) {
|
||||
if (!GrowStack())
|
||||
return;
|
||||
}
|
||||
int op = prog_->inst(id)->opcode();
|
||||
if (op == kInstFail)
|
||||
return;
|
||||
|
||||
// Only check ShouldVisit when arg == 0.
|
||||
// When arg > 0, we are continuing a previous visit.
|
||||
if (arg == 0 && !ShouldVisit(id, p))
|
||||
return;
|
||||
|
||||
Job* j = &job_[njob_++];
|
||||
j->id = id;
|
||||
j->p = p;
|
||||
j->arg = arg;
|
||||
}
|
||||
|
||||
// Try a search from instruction id0 in state p0.
|
||||
// Return whether it succeeded.
|
||||
bool BitState::TrySearch(int id0, const char* p0) {
|
||||
bool matched = false;
|
||||
const char* end = text_.end();
|
||||
njob_ = 0;
|
||||
Push(id0, p0, 0);
|
||||
while (njob_ > 0) {
|
||||
// Pop job off stack.
|
||||
--njob_;
|
||||
int id = job_[njob_].id;
|
||||
const char* p = job_[njob_].p;
|
||||
int arg = job_[njob_].arg;
|
||||
|
||||
// Optimization: rather than push and pop,
|
||||
// code that is going to Push and continue
|
||||
// the loop simply updates ip, p, and arg
|
||||
// and jumps to CheckAndLoop. We have to
|
||||
// do the ShouldVisit check that Push
|
||||
// would have, but we avoid the stack
|
||||
// manipulation.
|
||||
if (0) {
|
||||
CheckAndLoop:
|
||||
if (!ShouldVisit(id, p))
|
||||
continue;
|
||||
}
|
||||
|
||||
// Visit ip, p.
|
||||
// VLOG(0) << "Job: " << ip->id() << " "
|
||||
// << (p - text_.begin()) << " " << arg;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
case kInstFail:
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
|
||||
return false;
|
||||
|
||||
case kInstAlt:
|
||||
// Cannot just
|
||||
// Push(ip->out1(), p, 0);
|
||||
// Push(ip->out(), p, 0);
|
||||
// If, during the processing of ip->out(), we encounter
|
||||
// ip->out1() via another path, we want to process it then.
|
||||
// Pushing it here will inhibit that. Instead, re-push
|
||||
// ip with arg==1 as a reminder to push ip->out1() later.
|
||||
switch (arg) {
|
||||
case 0:
|
||||
Push(id, p, 1); // come back when we're done
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case 1:
|
||||
// Finished ip->out(); try ip->out1().
|
||||
arg = 0;
|
||||
id = ip->out1();
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
||||
continue;
|
||||
|
||||
case kInstAltMatch:
|
||||
// One opcode is byte range; the other leads to match.
|
||||
if (ip->greedy(prog_)) {
|
||||
// out1 is the match
|
||||
Push(ip->out1(), p, 0);
|
||||
id = ip->out1();
|
||||
p = end;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
// out is the match - non-greedy
|
||||
Push(ip->out(), end, 0);
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstByteRange: {
|
||||
int c = -1;
|
||||
if (p < end)
|
||||
c = *p & 0xFF;
|
||||
if (ip->Matches(c)) {
|
||||
id = ip->out();
|
||||
p++;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
case kInstCapture:
|
||||
switch (arg) {
|
||||
case 0:
|
||||
if (0 <= ip->cap() && ip->cap() < ncap_) {
|
||||
// Capture p to register, but save old value.
|
||||
Push(id, cap_[ip->cap()], 1); // come back when we're done
|
||||
cap_[ip->cap()] = p;
|
||||
}
|
||||
// Continue on.
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
case 1:
|
||||
// Finished ip->out(); restore the old value.
|
||||
cap_[ip->cap()] = p;
|
||||
continue;
|
||||
}
|
||||
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
||||
continue;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
||||
continue;
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstNop:
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstMatch: {
|
||||
if (endmatch_ && p != text_.end())
|
||||
continue;
|
||||
|
||||
// VLOG(0) << "Found match.";
|
||||
// We found a match. If the caller doesn't care
|
||||
// where the match is, no point going further.
|
||||
if (nsubmatch_ == 0)
|
||||
return true;
|
||||
|
||||
// Record best match so far.
|
||||
// Only need to check end point, because this entire
|
||||
// call is only considering one start position.
|
||||
matched = true;
|
||||
cap_[1] = p;
|
||||
if (submatch_[0].data() == NULL ||
|
||||
(longest_ && p > submatch_[0].end())) {
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
|
||||
}
|
||||
|
||||
// If going for first match, we're done.
|
||||
if (!longest_)
|
||||
return true;
|
||||
|
||||
// If we used the entire text, no longer match is possible.
|
||||
if (p == text_.end())
|
||||
return true;
|
||||
|
||||
// Otherwise, continue on in hope of a longer match.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
return matched;
|
||||
}
|
||||
|
||||
// Search text (within context) for prog_.
|
||||
bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch) {
|
||||
// Search parameters.
|
||||
text_ = text;
|
||||
context_ = context;
|
||||
if (context_.begin() == NULL)
|
||||
context_ = text;
|
||||
if (prog_->anchor_start() && context_.begin() != text.begin())
|
||||
return false;
|
||||
if (prog_->anchor_end() && context_.end() != text.end())
|
||||
return false;
|
||||
anchored_ = anchored || prog_->anchor_start();
|
||||
longest_ = longest || prog_->anchor_end();
|
||||
endmatch_ = prog_->anchor_end();
|
||||
submatch_ = submatch;
|
||||
nsubmatch_ = nsubmatch;
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = NULL;
|
||||
|
||||
// Allocate scratch space.
|
||||
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
|
||||
visited_ = new uint32[nvisited_];
|
||||
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
||||
// VLOG(0) << "nvisited_ = " << nvisited_;
|
||||
|
||||
ncap_ = 2*nsubmatch;
|
||||
if (ncap_ < 2)
|
||||
ncap_ = 2;
|
||||
cap_ = new const char*[ncap_];
|
||||
memset(cap_, 0, ncap_*sizeof cap_[0]);
|
||||
|
||||
maxjob_ = 256;
|
||||
job_ = new Job[maxjob_];
|
||||
|
||||
// Anchored search must start at text.begin().
|
||||
if (anchored_) {
|
||||
cap_[0] = text.begin();
|
||||
return TrySearch(prog_->start(), text.begin());
|
||||
}
|
||||
|
||||
// Unanchored search, starting from each possible text position.
|
||||
// Notice that we have to try the empty string at the end of
|
||||
// the text, so the loop condition is p <= text.end(), not p < text.end().
|
||||
// This looks like it's quadratic in the size of the text,
|
||||
// but we are not clearing visited_ between calls to TrySearch,
|
||||
// so no work is duplicated and it ends up still being linear.
|
||||
for (const char* p = text.begin(); p <= text.end(); p++) {
|
||||
cap_[0] = p;
|
||||
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Bit-state search.
|
||||
bool Prog::SearchBitState(const StringPiece& text,
|
||||
const StringPiece& context,
|
||||
Anchor anchor,
|
||||
MatchKind kind,
|
||||
StringPiece* match,
|
||||
int nmatch) {
|
||||
// If full match, we ask for an anchored longest match
|
||||
// and then check that match[0] == text.
|
||||
// So make sure match[0] exists.
|
||||
StringPiece sp0;
|
||||
if (kind == kFullMatch) {
|
||||
anchor = kAnchored;
|
||||
if (nmatch < 1) {
|
||||
match = &sp0;
|
||||
nmatch = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the search.
|
||||
BitState b(this);
|
||||
bool anchored = anchor == kAnchored;
|
||||
bool longest = kind != kFirstMatch;
|
||||
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
||||
return false;
|
||||
if (kind == kFullMatch && match[0].end() != text.end())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,100 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <string>
|
||||
#include "util/util.h"
|
||||
#include "re2/filtered_re2.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/prefilter_tree.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Starts out uncompiled, with no regexps and a fresh, empty prefilter tree.
FilteredRE2::FilteredRE2()
    : compiled_(false), prefilter_tree_(new PrefilterTree()) {}
|
||||
|
||||
// Deletes every RE2 stored by Add, then the prefilter tree.
FilteredRE2::~FilteredRE2() {
  for (vector<RE2*>::iterator it = re2_vec_.begin();
       it != re2_vec_.end(); ++it)
    delete *it;
  delete prefilter_tree_;
}
|
||||
|
||||
// Compiles pattern with options and, on success, appends the resulting RE2
// to re2_vec_ and stores its index in *id.
// Returns the RE2's error code.  On failure the error is logged, the RE2 is
// deleted and *id is left untouched.
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  RE2::ErrorCode code = re->error_code();

  if (!re->ok()) {
    // Log the pattern text: streaming the RE2* itself would only print
    // a pointer value.
    LOG(ERROR) << "Couldn't compile regular expression, skipping: "
               << pattern << " due to error " << re->error();
    delete re;
  } else {
    *id = re2_vec_.size();
    re2_vec_.push_back(re);
  }

  return code;
}
|
||||
|
||||
// Builds the prefilter tree from every regexp added so far and writes the
// atoms it produces into *atoms (cleared first).
// Does nothing except log if Compile has already run or no regexp has been
// added; *atoms is left as it was in that case.
void FilteredRE2::Compile(vector<string>* atoms) {
  const bool nothing_to_do = compiled_ || re2_vec_.size() == 0;
  if (nothing_to_do) {
    LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
    return;
  }

  for (int i = 0; i < re2_vec_.size(); i++)
    prefilter_tree_->Add(Prefilter::FromRE2(re2_vec_[i]));

  atoms->clear();
  prefilter_tree_->Compile(atoms);
  compiled_ = true;
}
|
||||
|
||||
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
|
||||
for (int i = 0; i < re2_vec_.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[i]))
|
||||
return i;
|
||||
return -1;
|
||||
}
|
||||
|
||||
int FilteredRE2::FirstMatch(const StringPiece& text,
|
||||
const vector<int>& atoms) const {
|
||||
if (!compiled_) {
|
||||
LOG(DFATAL) << "FirstMatch called before Compile";
|
||||
return -1;
|
||||
}
|
||||
vector<int> regexps;
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, ®exps);
|
||||
for (int i = 0; i < regexps.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
||||
return regexps[i];
|
||||
return -1;
|
||||
}
|
||||
|
||||
bool FilteredRE2::AllMatches(
|
||||
const StringPiece& text,
|
||||
const vector<int>& atoms,
|
||||
vector<int>* matching_regexps) const {
|
||||
matching_regexps->clear();
|
||||
vector<int> regexps;
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, ®exps);
|
||||
for (int i = 0; i < regexps.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
||||
matching_regexps->push_back(regexps[i]);
|
||||
return !matching_regexps->empty();
|
||||
}
|
||||
|
||||
// Forwards to the prefilter tree: given the atoms that matched, fills
// *passed_regexps with whatever PrefilterTree::RegexpsGivenStrings reports.
void FilteredRE2::RegexpsGivenStrings(
    const vector<int>& matched_atoms,
    vector<int>* passed_regexps) {
  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
|
||||
|
||||
|
||||
// Forwards to the prefilter tree to print the prefilter of regexp regexpid.
void FilteredRE2::PrintPrefilter(int regexpid) {
  prefilter_tree_->PrintPrefilter(regexpid);
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,101 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
|
||||
// It provides a prefilter mechanism that helps in cutting down the
|
||||
// number of regexps that need to be actually searched.
|
||||
//
|
||||
// By design, it does not include a string matching engine. This is to
|
||||
// allow the user of the class to use their favorite string match
|
||||
// engine. The overall flow is: Add all the regexps using Add, then
|
||||
// Compile the FilteredRE2. The compile returns strings that need to
|
||||
// be matched. Note that all returned strings are lowercase. For
|
||||
// applying regexps to a search text, the caller does the string
|
||||
// matching using the strings returned. When doing the string match,
|
||||
// note that the caller has to do that on a lowercased version of the
|
||||
// search text. Then call FirstMatch or AllMatches with a vector of
|
||||
// indices of strings that were found in the text to get the actual
|
||||
// regexp matches.
|
||||
|
||||
#ifndef RE2_FILTERED_RE2_H_
|
||||
#define RE2_FILTERED_RE2_H_
|
||||
|
||||
#include <vector>
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
using std::vector;
|
||||
|
||||
class PrefilterTree;
|
||||
|
||||
class FilteredRE2 {
|
||||
public:
|
||||
FilteredRE2();
|
||||
~FilteredRE2();
|
||||
|
||||
// Uses RE2 constructor to create a RE2 object (re). Returns
|
||||
// re->error_code(). If error_code is other than NoError, then re is
|
||||
// deleted and not added to re2_vec_.
|
||||
RE2::ErrorCode Add(const StringPiece& pattern,
|
||||
const RE2::Options& options,
|
||||
int *id);
|
||||
|
||||
// Prepares the regexps added by Add for filtering. Returns a set
|
||||
// of strings that the caller should check for in candidate texts.
|
||||
// The returned strings are lowercased. When doing string matching,
|
||||
// the search text should be lowercased first to find matching
|
||||
// strings from the set of strings returned by Compile. Call after
|
||||
// all Add calls are done.
|
||||
void Compile(vector<string>* strings_to_match);
|
||||
|
||||
// Returns the index of the first matching regexp.
|
||||
// Returns -1 on no match. Can be called prior to Compile.
|
||||
// Does not do any filtering: simply tries to Match the
|
||||
// regexps in a loop.
|
||||
int SlowFirstMatch(const StringPiece& text) const;
|
||||
|
||||
// Returns the index of the first matching regexp.
|
||||
// Returns -1 on no match. Compile has to be called before
|
||||
// calling this.
|
||||
int FirstMatch(const StringPiece& text,
|
||||
const vector<int>& atoms) const;
|
||||
|
||||
// Returns the indices of all matching regexps, after first clearing
|
||||
// matched_regexps.
|
||||
bool AllMatches(const StringPiece& text,
|
||||
const vector<int>& atoms,
|
||||
vector<int>* matching_regexps) const;
|
||||
|
||||
// The number of regexps added.
|
||||
int NumRegexps() const { return re2_vec_.size(); }
|
||||
|
||||
private:
|
||||
|
||||
// Get the individual RE2 objects. Useful for testing.
|
||||
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
|
||||
|
||||
// Print prefilter.
|
||||
void PrintPrefilter(int regexpid);
|
||||
|
||||
// Useful for testing and debugging.
|
||||
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
||||
vector<int>* passed_regexps);
|
||||
|
||||
// All the regexps in the FilteredRE2.
|
||||
vector<RE2*> re2_vec_;
|
||||
|
||||
// Has the FilteredRE2 been compiled using Compile()
|
||||
bool compiled_;
|
||||
|
||||
// An AND-OR tree of string atoms used for filtering regexps.
|
||||
PrefilterTree* prefilter_tree_;
|
||||
|
||||
//DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
|
||||
FilteredRE2(const FilteredRE2&);
|
||||
void operator=(const FilteredRE2&);
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_FILTERED_RE2_H_
|
|
@ -0,0 +1,110 @@
|
|||
#!/usr/bin/perl
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Generate table entries giving character ranges
|
||||
# for POSIX/Perl character classes. Rather than
|
||||
# figure out what the definition is, it is easier to ask
|
||||
# Perl about each letter from 0-128 and write down
|
||||
# its answer.
|
||||
|
||||
# Names of the POSIX bracket classes to tabulate, e.g. "[:alnum:]".
@posixclasses = map { '[:' . $_ . ':]' } qw(
  alnum alpha ascii blank cntrl digit graph
  lower print punct space upper word xdigit
);

# Perl shorthand classes to tabulate (backslash followed by a letter).
@perlclasses = ('\d', '\s', '\w');
|
||||
|
||||
# Returns the list of [lo, hi] code ranges that $class matches, found by
# testing chr(0) .. chr(128) against the bracket expression "[$class]".
# A range still open after code 128 is closed at 255.
sub ComputeClass($) {
  my ($class) = @_;
  my $pattern = "[$class]";
  my @ranges;
  my $open = -1;    # first code of the range being built, or -1 if none

  foreach my $code (0 .. 128) {
    if (chr($code) =~ $pattern) {
      $open = $code if $open < 0;
    } else {
      push @ranges, [$open, $code - 1] if $open >= 0;
      $open = -1;
    }
  }

  # Anything matching through 128 is taken to extend to the last byte.
  push @ranges, [$open, 255] if $open >= 0;
  return @ranges;
}
|
||||
|
||||
# Prints a URange16 table named code$cname holding @ranges (each a
# [lo, hi] pair) and returns two UGroup initializer strings: one for the
# class $name itself (+1) and one for its negation (-1).
# The negated name is "[:^foo:]" for POSIX classes and the upper-cased
# letter (e.g. \D) for Perl classes.
sub PrintClass($$@) {
  my ($cname, $name, @ranges) = @_;

  print "static URange16 code${cname}[] = { /* $name */\n";
  foreach my $range (@ranges) {
    printf "\t{ 0x%x, 0x%x },\n", $range->[0], $range->[1];
  }
  print "};\n";

  my $n = @ranges;

  # Double the backslashes so the name survives as a C string literal.
  my $escname = $name;
  $escname =~ s/\\/\\\\/g;

  # Lexical, so the value does not leak into a package global.
  my $negname = $escname;
  if ($negname =~ /:/) {
    $negname =~ s/:/:^/;      # only the first colon: [:x:] -> [:^x:]
  } else {
    $negname =~ y/a-z/A-Z/;
  }

  return "{ \"$escname\", +1, code$cname, $n }", "{ \"$negname\", -1, code$cname, $n }";
}
|
||||
|
||||
# Running counter used to give every generated range table a unique name.
my $gen = 0;

# Prints the range tables for every class in @classes, followed by the
# ${cname}_groups array of UGroup entries and its element count.
sub PrintClasses($@) {
  my ($cname, @classes) = @_;

  my @entries;
  for my $class (@classes) {
    my @ranges = ComputeClass($class);
    my @pair = PrintClass(++$gen, $class, @ranges);
    push @entries, @pair;
  }

  print "UGroup ${cname}_groups[] = {\n";
  print "\t$_,\n" for @entries;
  print "};\n";

  my $count = scalar(@entries);
  print "int num_${cname}_groups = $count;\n";
}
|
||||
|
||||
# Emit the generated file: fixed header, the Perl and POSIX group
# tables, then the closing of the namespace.
print <<'HEADER';
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

HEADER

PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

print <<'TRAILER';

} // namespace re2
TRAILER
|
|
@ -0,0 +1,146 @@
|
|||
#!/usr/bin/python
|
||||
# coding=utf-8
|
||||
#
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# See unicode_casefold.h for description of case folding tables.
|
||||
|
||||
"""Generate C++ table for Unicode case folding."""
|
||||
|
||||
import unicode, sys
|
||||
|
||||
_header = """
|
||||
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
|
||||
// make_unicode_casefold.py >unicode_casefold.cc
|
||||
|
||||
#include "re2/unicode_casefold.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
"""
|
||||
|
||||
_trailer = """
|
||||
|
||||
} // namespace re2
|
||||
|
||||
"""
|
||||
|
||||
def _Delta(a, b):
|
||||
"""Compute the delta for b - a. Even/odd and odd/even
|
||||
are handled specially, as described above."""
|
||||
if a+1 == b:
|
||||
if a%2 == 0:
|
||||
return 'EvenOdd'
|
||||
else:
|
||||
return 'OddEven'
|
||||
if a == b+1:
|
||||
if a%2 == 0:
|
||||
return 'OddEven'
|
||||
else:
|
||||
return 'EvenOdd'
|
||||
return b - a
|
||||
|
||||
def _AddDelta(a, delta):
|
||||
"""Return a + delta, handling EvenOdd and OddEven specially."""
|
||||
if type(delta) == int:
|
||||
return a+delta
|
||||
if delta == 'EvenOdd':
|
||||
if a%2 == 0:
|
||||
return a+1
|
||||
else:
|
||||
return a-1
|
||||
if delta == 'OddEven':
|
||||
if a%2 == 1:
|
||||
return a+1
|
||||
else:
|
||||
return a-1
|
||||
print >>sys.stderr, "Bad Delta: ", delta
|
||||
raise "Bad Delta"
|
||||
|
||||
def _MakeRanges(pairs):
  """Collapse (code, folded) pairs into [lo, hi, delta] ranges.

  For example [(65,97), (66,98), ..., (90,122)] becomes [[65, 90, 32]].
  Consecutive codes sharing a delta extend the current range.  Codes
  two apart that share a symbolic delta extend it as well, in which
  case the range's delta gains a 'Skip' suffix.
  """

  def extend_run(prev, a, b, run):
    # a must directly follow prev and obey the run's delta.
    if a != prev + 1:
      return False
    if b != _AddDelta(a, run[2]):
      return False
    run[1] = a
    return True

  def extend_skip_run(prev, a, b, run):
    # a must be two past prev; only symbolic (string) deltas can skip.
    if a != prev + 2:
      return False
    tag = run[2]
    if type(tag) is not str:
      return False
    if tag.endswith('Skip'):
      base = tag[:-4]
      skip_tag = tag
    else:
      base = tag
      skip_tag = tag + 'Skip'
    if b != _AddDelta(a, base):
      return False
    run[1] = a
    run[2] = skip_tag
    return True

  ranges = []
  last = -100
  for a, b in pairs:
    grown = bool(ranges) and (extend_run(last, a, b, ranges[-1]) or
                              extend_skip_run(last, a, b, ranges[-1]))
    if not grown:
      ranges.append([a, a, _Delta(a, b)])
    last = a
  return ranges
|
||||
|
||||
# The maximum size of a case-folding group.
# Case folding is implemented in parse.cc by a recursive process
# with a recursion depth equal to the size of the largest
# case-folding group, so it is important that this bound be small.
# The current tables have no group bigger than 4.
# If there are ever groups bigger than 10 or so, it will be
# time to rework the code in parse.cc.
MaxCasefoldGroup = 4

def main():
  """Print the C++ source for the casefold and tolower tables to stdout.

  Raises:
    unicode.Error: if a case group exceeds MaxCasefoldGroup or a code
      point appears in more than one group.
  """
  lowergroups, casegroups = unicode.CaseGroups()

  # Each group becomes a cycle of (code, next code) pairs.
  foldpairs = []
  seen = {}
  for c in casegroups:
    if len(c) > MaxCasefoldGroup:
      raise unicode.Error("casefold group too long: %s" % (c,))
    for i in range(len(c)):
      # c[i-1] wraps to the last element when i == 0, closing the cycle.
      if c[i-1] in seen:
        raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
      seen[c[i-1]] = True
      foldpairs.append([c[i-1], c[i]])

  # Every non-lowercase member of a group maps to the group's lowercase.
  lowerpairs = []
  for lower, group in lowergroups.items():
    for g in group:
      if g != lower:
        lowerpairs.append([g, lower])

  def printpairs(name, foldpairs):
    # Single-argument print(...) behaves the same on Python 2 and 3.
    foldpairs.sort()
    foldranges = _MakeRanges(foldpairs)
    print("// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)))
    print("CaseFold unicode_%s[] = {" % (name,))
    for lo, hi, delta in foldranges:
      print("\t{ %d, %d, %s }," % (lo, hi, delta))
    print("};")
    print("int num_unicode_%s = %d;" % (name, len(foldranges),))
    print("")

  print(_header)
  printpairs("casefold", foldpairs)
  printpairs("tolower", lowerpairs)
  print(_trailer)

if __name__ == '__main__':
  main()
|
|
@ -0,0 +1,111 @@
|
|||
#!/usr/bin/python
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
"""Generate C++ tables for Unicode Script and Category groups."""
|
||||
|
||||
import sys
|
||||
import unicode
|
||||
|
||||
_header = """
|
||||
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
|
||||
// make_unicode_groups.py >unicode_groups.cc
|
||||
|
||||
#include "re2/unicode_groups.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
"""
|
||||
|
||||
_trailer = """
|
||||
|
||||
} // namespace re2
|
||||
|
||||
"""
|
||||
|
||||
n16 = 0
|
||||
n32 = 0
|
||||
|
||||
def MakeRanges(codes):
  """Group runs of consecutive codes into [lo, hi] pairs.

  For example [1,2,3,7,8,9] becomes [[1,3], [7,9]].  Codes are taken in
  the order given; a code that does not directly follow its predecessor
  starts a new range.
  """
  result = []
  prev = -100
  for code in codes:
    if code != prev + 1:
      result.append([code, code])
    else:
      result[-1][1] = code
    prev = code
  return result
|
||||
|
||||
def PrintRanges(type, name, ranges):
  """Print the ranges as an array of type named name.

  Args:
    type: C++ element type of the array (e.g. "URange16").
    name: identifier of the generated array.
    ranges: iterable of (lo, hi) pairs, one per array element.
  """
  # Single-argument print(...) behaves the same on Python 2 and 3.
  print("static %s %s[] = {" % (type, name,))
  for lo, hi in ranges:
    print("\t{ %d, %d }," % (lo, hi))
  print("};")
|
||||
|
||||
# def PrintCodes(type, name, codes):
|
||||
# """Print the codes as an array of type named name."""
|
||||
# print "static %s %s[] = {" % (type, name,)
|
||||
# for c in codes:
|
||||
# print "\t%d," % (c,)
|
||||
# print "};"
|
||||
|
||||
def PrintGroup(name, codes):
  """Print the range tables for one group and return its UGroup literal.

  Codes below 65536 go into a URange16 table and the rest into a
  URange32 table; an empty table is omitted and written as "0, 0" in
  the literal.  The module-level counters n16 and n32 are advanced by
  the number of ranges emitted.
  """
  # See unicode_groups.h for a description of the data structure.
  global n16, n32

  # Split codes into 16-bit ranges and 32-bit ranges.
  range16 = MakeRanges([c for c in codes if c < 65536])
  range32 = MakeRanges([c for c in codes if c >= 65536])

  # Pulling singleton ranges out of range16 into a separate code16
  # table is currently disabled:
  # code16 = [lo for lo, hi in range16 if lo == hi]
  # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]

  n16 += len(range16)
  n32 += len(range32)

  pieces = ["{ \"%s\", +1" % (name,)]
  tables = (("URange16", "_range16", range16),
            ("URange32", "_range32", range32))
  for ctype, suffix, table in tables:
    if table:
      PrintRanges(ctype, name + suffix, table)
      pieces.append(", %s%s, %d" % (name, suffix, len(table)))
    else:
      pieces.append(", 0, 0")
  pieces.append(" }")
  return "".join(pieces)
|
||||
|
||||
def main():
  """Print the C++ source for the Unicode category and script tables."""
  # Single-argument print(...) behaves the same on Python 2 and 3,
  # as does dict.items().
  print(_header)

  # PrintGroup emits each group's range tables as a side effect and
  # returns the UGroup literal that refers to them.
  ugroups = []
  for name, codes in unicode.Categories().items():
    ugroups.append(PrintGroup(name, codes))
  for name, codes in unicode.Scripts().items():
    ugroups.append(PrintGroup(name, codes))

  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("UGroup unicode_groups[] = {")
  ugroups.sort()
  for ug in ugroups:
    print("\t%s," % (ug,))
  print("};")
  print("int num_unicode_groups = %d;" % (len(ugroups),))
  print(_trailer)

if __name__ == '__main__':
  main()
|
|
@ -0,0 +1,185 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Determine whether this library should match PCRE exactly
|
||||
// for a particular Regexp. (If so, the testing framework can
|
||||
// check that it does.)
|
||||
//
|
||||
// This library matches PCRE except in these cases:
|
||||
// * the regexp contains a repetition of an empty string,
|
||||
// like (a*)* or (a*)+. In this case, PCRE will treat
|
||||
// the repetition sequence as ending with an empty string,
|
||||
// while this library does not.
|
||||
// * Perl and PCRE differ on whether \v matches \n.
|
||||
// For historical reasons, this library implements the Perl behavior.
|
||||
// * Perl and PCRE allow $ in one-line mode to match either the very
|
||||
// end of the text or just before a \n at the end of the text.
|
||||
// This library requires it to match only the end of the text.
|
||||
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
|
||||
// match the end of the text if the last character is a \n.
|
||||
// This library does allow it.
|
||||
//
|
||||
// Regexp::MimicsPCRE checks for any of these conditions.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Returns whether re might match an empty string.
|
||||
static bool CanBeEmptyString(Regexp *re);
|
||||
|
||||
// Walker class to compute whether library handles a regexp
|
||||
// exactly as PCRE would. See comment at top for conditions.
|
||||
|
||||
class PCREWalker : public Regexp::Walker<bool> {
|
||||
public:
|
||||
PCREWalker() {}
|
||||
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
|
||||
int nchild_args);
|
||||
|
||||
bool ShortVisit(Regexp* re, bool a) {
|
||||
// Should never be called: we use Walk not WalkExponential.
|
||||
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
||||
return a;
|
||||
}
|
||||
};
|
||||
|
||||
// Called after visiting each of re's children and accumulating
|
||||
// the return values in child_args. So child_args contains whether
|
||||
// this library mimics PCRE for those subexpressions.
|
||||
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args) {
|
||||
// If children failed, so do we.
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (!child_args[i])
|
||||
return false;
|
||||
|
||||
// Otherwise look for other reasons to fail.
|
||||
switch (re->op()) {
|
||||
// Look for repeated empty string.
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
if (CanBeEmptyString(re->sub()[0]))
|
||||
return false;
|
||||
break;
|
||||
case kRegexpRepeat:
|
||||
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for \v
|
||||
case kRegexpLiteral:
|
||||
if (re->rune() == '\v')
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for $ in single-line mode.
|
||||
case kRegexpEndText:
|
||||
case kRegexpEmptyMatch:
|
||||
if (re->parse_flags() & Regexp::WasDollar)
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for ^ in multi-line mode.
|
||||
case kRegexpBeginLine:
|
||||
// No condition: in single-line mode ^ becomes kRegexpBeginText.
|
||||
return false;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
// Not proven guilty.
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns whether this regexp's behavior will mimic PCRE's exactly.
|
||||
bool Regexp::MimicsPCRE() {
|
||||
PCREWalker w;
|
||||
return w.Walk(this, true);
|
||||
}
|
||||
|
||||
|
||||
// Walker class to compute whether a Regexp can match an empty string.
|
||||
// It is okay to overestimate. For example, \b\B cannot match an empty
|
||||
// string, because \b and \B are mutually exclusive, but this isn't
|
||||
// that smart and will say it can. Spurious empty strings
|
||||
// will reduce the number of regexps we sanity check against PCRE,
|
||||
// but they won't break anything.
|
||||
|
||||
class EmptyStringWalker : public Regexp::Walker<bool> {
|
||||
public:
|
||||
EmptyStringWalker() { }
|
||||
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args);
|
||||
|
||||
bool ShortVisit(Regexp* re, bool a) {
|
||||
// Should never be called: we use Walk not WalkExponential.
|
||||
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
||||
return a;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
|
||||
};
|
||||
|
||||
// Called after visiting re's children. child_args contains the return
|
||||
// value from each of the children's PostVisits (i.e., whether each child
|
||||
// can match an empty string). Returns whether this clause can match an
|
||||
// empty string.
|
||||
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args) {
|
||||
switch (re->op()) {
|
||||
case kRegexpNoMatch: // never empty
|
||||
case kRegexpLiteral:
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
case kRegexpCharClass:
|
||||
case kRegexpLiteralString:
|
||||
return false;
|
||||
|
||||
case kRegexpEmptyMatch: // always empty
|
||||
case kRegexpBeginLine: // always empty, when they match
|
||||
case kRegexpEndLine:
|
||||
case kRegexpNoWordBoundary:
|
||||
case kRegexpWordBoundary:
|
||||
case kRegexpBeginText:
|
||||
case kRegexpEndText:
|
||||
case kRegexpStar: // can always be empty
|
||||
case kRegexpQuest:
|
||||
case kRegexpHaveMatch:
|
||||
return true;
|
||||
|
||||
case kRegexpConcat: // can be empty if all children can
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (!child_args[i])
|
||||
return false;
|
||||
return true;
|
||||
|
||||
case kRegexpAlternate: // can be empty if any child can
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (child_args[i])
|
||||
return true;
|
||||
return false;
|
||||
|
||||
case kRegexpPlus: // can be empty if the child can
|
||||
case kRegexpCapture:
|
||||
return child_args[0];
|
||||
|
||||
case kRegexpRepeat: // can be empty if child can or is x{0}
|
||||
return child_args[0] || re->min() == 0;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns whether re can match an empty string.
|
||||
static bool CanBeEmptyString(Regexp* re) {
|
||||
EmptyStringWalker w;
|
||||
return w.Walk(re, true);
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,709 @@
|
|||
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc.
|
||||
//
|
||||
// Prog::SearchNFA, an NFA search.
|
||||
// This is an actual NFA like the theorists talk about,
|
||||
// not the pseudo-NFA found in backtracking regexp implementations.
|
||||
//
|
||||
// IMPLEMENTATION
|
||||
//
|
||||
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
|
||||
// which is a variant of the one described in Thompson's 1968 CACM paper.
|
||||
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
|
||||
// over the DFA implementation is that it tracks submatch boundaries.
|
||||
//
|
||||
// When the choice of submatch boundaries is ambiguous, this particular
|
||||
// implementation makes the same choices that traditional backtracking
|
||||
// implementations (in particular, Perl and PCRE) do.
|
||||
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
|
||||
// time in the length of the input.
|
||||
//
|
||||
// Like Thompson's original machine and like the DFA implementation, this
|
||||
// implementation notices a match only once it is one byte past it.
|
||||
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "util/sparse_array.h"
|
||||
#include "util/sparse_set.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class NFA {
|
||||
public:
|
||||
NFA(Prog* prog);
|
||||
~NFA();
|
||||
|
||||
// Searches for a matching string.
|
||||
// * If anchored is true, only considers matches starting at offset.
|
||||
// Otherwise finds lefmost match at or after offset.
|
||||
// * If longest is true, returns the longest match starting
|
||||
// at the chosen start point. Otherwise returns the so-called
|
||||
// left-biased match, the one traditional backtracking engines
|
||||
// (like Perl and PCRE) find.
|
||||
// Records submatch boundaries in submatch[1..nsubmatch-1].
|
||||
// Submatch[0] is the entire match. When there is a choice in
|
||||
// which text matches each subexpression, the submatch boundaries
|
||||
// are chosen to match what a backtracking implementation would choose.
|
||||
bool Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch);
|
||||
|
||||
static const int Debug = 0;
|
||||
|
||||
private:
|
||||
struct Thread {
|
||||
union {
|
||||
int id;
|
||||
Thread* next; // when on free list
|
||||
};
|
||||
const char** capture;
|
||||
};
|
||||
|
||||
// State for explicit stack in AddToThreadq.
|
||||
struct AddState {
|
||||
int id; // Inst to process
|
||||
int j;
|
||||
const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
|
||||
|
||||
AddState()
|
||||
: id(0), j(-1), cap_j(NULL) {}
|
||||
explicit AddState(int id)
|
||||
: id(id), j(-1), cap_j(NULL) {}
|
||||
AddState(int id, const char* cap_j, int j)
|
||||
: id(id), j(j), cap_j(cap_j) {}
|
||||
};
|
||||
|
||||
// Threadq is a list of threads. The list is sorted by the order
|
||||
// in which Perl would explore that particular state -- the earlier
|
||||
// choices appear earlier in the list.
|
||||
typedef SparseArray<Thread*> Threadq;
|
||||
|
||||
inline Thread* AllocThread();
|
||||
inline void FreeThread(Thread*);
|
||||
|
||||
// Add r (or its children, following unlabeled arrows)
|
||||
// to the workqueue q with associated capture info.
|
||||
void AddToThreadq(Threadq* q, int id, int flag,
|
||||
const char* p, const char** capture);
|
||||
|
||||
// Run runq on byte c, appending new states to nextq.
|
||||
// Updates matched_ and match_ as new, better matches are found.
|
||||
// p is position of the next byte (the one after c)
|
||||
// in the input string, used when processing capturing parens.
|
||||
// flag is the bitwise or of Bol, Eol, etc., specifying whether
|
||||
// ^, $ and \b match the current input point (after c).
|
||||
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
|
||||
|
||||
// Returns text version of capture information, for debugging.
|
||||
string FormatCapture(const char** capture);
|
||||
|
||||
inline void CopyCapture(const char** dst, const char** src);
|
||||
|
||||
// Computes whether all matches must begin with the same first
|
||||
// byte, and if so, returns that byte. If not, returns -1.
|
||||
int ComputeFirstByte();
|
||||
|
||||
Prog* prog_; // underlying program
|
||||
int start_; // start instruction in program
|
||||
int ncapture_; // number of submatches to track
|
||||
bool longest_; // whether searching for longest match
|
||||
bool endmatch_; // whether match must end at text.end()
|
||||
const char* btext_; // beginning of text being matched (for FormatSubmatch)
|
||||
const char* etext_; // end of text being matched (for endmatch_)
|
||||
Threadq q0_, q1_; // pre-allocated for Search.
|
||||
const char** match_; // best match so far
|
||||
bool matched_; // any match so far?
|
||||
AddState* astack_; // pre-allocated for AddToThreadq
|
||||
int nastack_;
|
||||
int first_byte_; // required first byte for match, or -1 if none
|
||||
|
||||
Thread* free_threads_; // free list
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(NFA);
|
||||
};
|
||||
|
||||
// Builds the search state for prog: sizes both thread queues to the
// program length, allocates the AddToThreadq stack with two entries per
// instruction, and precomputes the required first byte.
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      match_(NULL),
      matched_(false),
      astack_(NULL),
      nastack_(0),
      first_byte_(-1),
      free_threads_(NULL) {
  q0_.resize(prog_->size());
  q1_.resize(prog_->size());

  // Sized in the body: astack_ is declared before nastack_, so it
  // cannot depend on nastack_ in the initializer list.
  nastack_ = 2*prog_->size();
  astack_ = new AddState[nastack_];

  // Last, once every other member has its initial value.
  first_byte_ = ComputeFirstByte();
}
|
||||
|
||||
// Releases the match buffer, the AddToThreadq stack, and every Thread
// on the free list together with its capture array.
NFA::~NFA() {
  delete[] match_;
  delete[] astack_;

  while (free_threads_ != NULL) {
    Thread* doomed = free_threads_;
    free_threads_ = doomed->next;
    delete[] doomed->capture;
    delete doomed;
  }
}
|
||||
|
||||
// Pushes t onto the head of the free list for later reuse.
// A NULL thread is ignored.
void NFA::FreeThread(Thread *t) {
  if (t != NULL) {
    t->next = free_threads_;
    free_threads_ = t;
  }
}
|
||||
|
||||
// Hands out a Thread: the head of the free list when there is one,
// otherwise a newly allocated Thread whose capture array has room for
// ncapture_ pointers.  The contents of the returned thread are not
// initialized here.
NFA::Thread* NFA::AllocThread() {
  if (free_threads_ != NULL) {
    Thread* recycled = free_threads_;
    free_threads_ = recycled->next;
    return recycled;
  }

  Thread* fresh = new Thread;
  fresh->capture = new const char*[ncapture_];
  return fresh;
}
|
||||
|
||||
// Copies the capture pointers from src to dst one (begin, end) pair at
// a time, for as long as the pair's first index is below ncapture_.
void NFA::CopyCapture(const char** dst, const char** src) {
  int pair = 0;
  while (pair < ncapture_) {
    dst[pair] = src[pair];
    dst[pair+1] = src[pair+1];
    pair += 2;
  }
}
|
||||
|
||||
// Follows all empty arrows from r and enqueues all the states reached.
|
||||
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
||||
// The pointer p is the current input position, and m is the
|
||||
// current set of match boundaries.
|
||||
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
|
||||
const char* p, const char** capture) {
|
||||
if (id0 == 0)
|
||||
return;
|
||||
|
||||
// Astack_ is pre-allocated to avoid resize operations.
|
||||
// It has room for 2*prog_->size() entries, which is enough:
|
||||
// Each inst in prog can be processed at most once,
|
||||
// pushing at most two entries on stk.
|
||||
|
||||
int nstk = 0;
|
||||
AddState* stk = astack_;
|
||||
stk[nstk++] = AddState(id0);
|
||||
|
||||
while (nstk > 0) {
|
||||
DCHECK_LE(nstk, nastack_);
|
||||
const AddState& a = stk[--nstk];
|
||||
if (a.j >= 0)
|
||||
capture[a.j] = a.cap_j;
|
||||
|
||||
int id = a.id;
|
||||
if (id == 0)
|
||||
continue;
|
||||
if (q->has_index(id)) {
|
||||
if (Debug)
|
||||
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create entry in q no matter what. We might fill it in below,
|
||||
// or we might not. Even if not, it is necessary to have it,
|
||||
// so that we don't revisit r during the recursion.
|
||||
q->set_new(id, NULL);
|
||||
|
||||
Thread** tp = &q->find(id)->second;
|
||||
int j;
|
||||
Thread* t;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
// Save state; will pick up at next byte.
|
||||
t = AllocThread();
|
||||
t->id = id;
|
||||
CopyCapture(t->capture, capture);
|
||||
*tp = t;
|
||||
// fall through
|
||||
|
||||
case kInstAlt:
|
||||
// Explore alternatives.
|
||||
stk[nstk++] = AddState(ip->out1());
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
|
||||
case kInstNop:
|
||||
// Continue on.
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
|
||||
case kInstCapture:
|
||||
if ((j=ip->cap()) < ncapture_) {
|
||||
// Push a dummy whose only job is to restore capture[j]
|
||||
// once we finish exploring this possibility.
|
||||
stk[nstk++] = AddState(0, capture[j], j);
|
||||
|
||||
// Record capture.
|
||||
capture[j] = p;
|
||||
}
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
case kInstByteRange:
|
||||
// Save state; will pick up at next byte.
|
||||
t = AllocThread();
|
||||
t->id = id;
|
||||
CopyCapture(t->capture, capture);
|
||||
*tp = t;
|
||||
if (Debug)
|
||||
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
|
||||
break;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
// Continue on if we have all the right flag bits.
|
||||
if (ip->empty() & ~flag)
|
||||
break;
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run runq on byte c, appending new states to nextq.
|
||||
// Updates match as new, better matches are found.
|
||||
// p is position of the byte c in the input string,
|
||||
// used when processing capturing parens.
|
||||
// flag is the bitwise or of Bol, Eol, etc., specifying whether
|
||||
// ^, $ and \b match the current input point (after c).
|
||||
// Frees all the threads on runq.
|
||||
// If there is a shortcut to the end, returns that shortcut.
|
||||
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
||||
nextq->clear();
|
||||
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||
Thread* t = i->second;
|
||||
if (t == NULL)
|
||||
continue;
|
||||
|
||||
if (longest_) {
|
||||
// Can skip any threads started after our current best match.
|
||||
if (matched_ && match_[0] < t->capture[0]) {
|
||||
FreeThread(t);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
int id = t->id;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
// Should only see the values handled below.
|
||||
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
|
||||
break;
|
||||
|
||||
case kInstByteRange:
|
||||
if (ip->Matches(c))
|
||||
AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
if (i != runq->begin())
|
||||
break;
|
||||
// The match is ours if we want it.
|
||||
if (ip->greedy(prog_) || longest_) {
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
FreeThread(t);
|
||||
for (++i; i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
runq->clear();
|
||||
matched_ = true;
|
||||
if (ip->greedy(prog_))
|
||||
return ip->out1();
|
||||
return ip->out();
|
||||
}
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
if (endmatch_ && p != etext_)
|
||||
break;
|
||||
|
||||
const char* old = t->capture[1]; // previous end pointer
|
||||
t->capture[1] = p;
|
||||
if (longest_) {
|
||||
// Leftmost-longest mode: save this match only if
|
||||
// it is either farther to the left or at the same
|
||||
// point but longer than an existing match.
|
||||
if (!matched_ || t->capture[0] < match_[0] ||
|
||||
(t->capture[0] == match_[0] && t->capture[1] > match_[1]))
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
} else {
|
||||
// Leftmost-biased mode: this match is by definition
|
||||
// better than what we've already found (see next line).
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
|
||||
// Cut off the threads that can only find matches
|
||||
// worse than the one we just found: don't run the
|
||||
// rest of the current Threadq.
|
||||
t->capture[0] = old;
|
||||
FreeThread(t);
|
||||
for (++i; i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
runq->clear();
|
||||
matched_ = true;
|
||||
return 0;
|
||||
}
|
||||
t->capture[0] = old;
|
||||
matched_ = true;
|
||||
break;
|
||||
}
|
||||
FreeThread(t);
|
||||
}
|
||||
runq->clear();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Renders the capture pairs as text for the debugging prints.  Each
// pair becomes "(begin,end)" with offsets measured from btext_; a NULL
// end is shown as '?', and a NULL begin prints the pair as "(?,?)".
string NFA::FormatCapture(const char** capture) {
  string text;

  int k = 0;
  while (k < ncapture_) {
    if (capture[k] == NULL) {
      StringAppendF(&text, "(?,?)");
    } else {
      const int begin_off = (int)(capture[k] - btext_);
      if (capture[k+1] == NULL) {
        StringAppendF(&text, "(%d,?)", begin_off);
      } else {
        StringAppendF(&text, "(%d,%d)",
                      begin_off,
                      (int)(capture[k+1] - btext_));
      }
    }
    k += 2;
  }
  return text;
}
|
||||
|
||||
// Returns whether haystack contains needle's memory.
|
||||
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
|
||||
return haystack.begin() <= needle.begin() &&
|
||||
haystack.end() >= needle.end();
|
||||
}
|
||||
|
||||
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch) {
|
||||
if (start_ == 0)
|
||||
return false;
|
||||
|
||||
StringPiece context = const_context;
|
||||
if (context.begin() == NULL)
|
||||
context = text;
|
||||
|
||||
if (!StringPieceContains(context, text)) {
|
||||
LOG(FATAL) << "Bad args: context does not contain text "
|
||||
<< reinterpret_cast<const void*>(context.begin())
|
||||
<< "+" << context.size() << " "
|
||||
<< reinterpret_cast<const void*>(text.begin())
|
||||
<< "+" << text.size();
|
||||
return false;
|
||||
}
|
||||
|
||||
if (prog_->anchor_start() && context.begin() != text.begin())
|
||||
return false;
|
||||
if (prog_->anchor_end() && context.end() != text.end())
|
||||
return false;
|
||||
anchored |= prog_->anchor_start();
|
||||
if (prog_->anchor_end()) {
|
||||
longest = true;
|
||||
endmatch_ = true;
|
||||
etext_ = text.end();
|
||||
}
|
||||
|
||||
if (nsubmatch < 0) {
|
||||
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Save search parameters.
|
||||
ncapture_ = 2*nsubmatch;
|
||||
longest_ = longest;
|
||||
|
||||
if (nsubmatch == 0) {
|
||||
// We need to maintain match[0], both to distinguish the
|
||||
// longest match (if longest is true) and also to tell
|
||||
// whether we've seen any matches at all.
|
||||
ncapture_ = 2;
|
||||
}
|
||||
|
||||
match_ = new const char*[ncapture_];
|
||||
matched_ = false;
|
||||
memset(match_, 0, ncapture_*sizeof match_[0]);
|
||||
|
||||
// For debugging prints.
|
||||
btext_ = context.begin();
|
||||
|
||||
if (Debug) {
|
||||
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
|
||||
text.as_string().c_str(), context.as_string().c_str(), anchored,
|
||||
longest);
|
||||
}
|
||||
|
||||
// Set up search.
|
||||
Threadq* runq = &q0_;
|
||||
Threadq* nextq = &q1_;
|
||||
runq->clear();
|
||||
nextq->clear();
|
||||
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
|
||||
const char* bp = context.begin();
|
||||
int c = -1;
|
||||
int wasword = 0;
|
||||
|
||||
if (text.begin() > context.begin()) {
|
||||
c = text.begin()[-1] & 0xFF;
|
||||
wasword = Prog::IsWordChar(c);
|
||||
}
|
||||
|
||||
// Loop over the text, stepping the machine.
|
||||
for (const char* p = text.begin();; p++) {
|
||||
// Check for empty-width specials.
|
||||
int flag = 0;
|
||||
|
||||
// ^ and \A
|
||||
if (p == context.begin())
|
||||
flag |= kEmptyBeginText | kEmptyBeginLine;
|
||||
else if (p <= context.end() && p[-1] == '\n')
|
||||
flag |= kEmptyBeginLine;
|
||||
|
||||
// $ and \z
|
||||
if (p == context.end())
|
||||
flag |= kEmptyEndText | kEmptyEndLine;
|
||||
else if (p < context.end() && p[0] == '\n')
|
||||
flag |= kEmptyEndLine;
|
||||
|
||||
// \b and \B
|
||||
int isword = 0;
|
||||
if (p < context.end())
|
||||
isword = Prog::IsWordChar(p[0] & 0xFF);
|
||||
|
||||
if (isword != wasword)
|
||||
flag |= kEmptyWordBoundary;
|
||||
else
|
||||
flag |= kEmptyNonWordBoundary;
|
||||
|
||||
if (Debug) {
|
||||
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||
Thread* t = i->second;
|
||||
if (t == NULL)
|
||||
continue;
|
||||
fprintf(stderr, " %d%s", t->id,
|
||||
FormatCapture((const char**)t->capture).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Process previous character (waited until now to avoid
|
||||
// repeating the flag computation above).
|
||||
// This is a no-op the first time around the loop, because
|
||||
// runq is empty.
|
||||
int id = Step(runq, nextq, c, flag, p-1);
|
||||
DCHECK_EQ(runq->size(), 0);
|
||||
swap(nextq, runq);
|
||||
nextq->clear();
|
||||
if (id != 0) {
|
||||
// We're done: full match ahead.
|
||||
p = text.end();
|
||||
for (;;) {
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
|
||||
break;
|
||||
|
||||
case kInstCapture:
|
||||
match_[ip->cap()] = p;
|
||||
id = ip->out();
|
||||
continue;
|
||||
|
||||
case kInstNop:
|
||||
id = ip->out();
|
||||
continue;
|
||||
|
||||
case kInstMatch:
|
||||
match_[1] = p;
|
||||
matched_ = true;
|
||||
break;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
|
||||
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
|
||||
break;
|
||||
}
|
||||
id = ip->out();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (p > text.end())
|
||||
break;
|
||||
|
||||
// Start a new thread if there have not been any matches.
|
||||
// (No point in starting a new thread if there have been
|
||||
// matches, since it would be to the right of the match
|
||||
// we already found.)
|
||||
if (!matched_ && (!anchored || p == text.begin())) {
|
||||
// If there's a required first byte for an unanchored search
|
||||
// and we're not in the middle of any possible matches,
|
||||
// use memchr to search for the byte quickly.
|
||||
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
|
||||
p < text.end() && (p[0] & 0xFF) != first_byte_) {
|
||||
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
|
||||
text.end() - p));
|
||||
if (p == NULL) {
|
||||
p = text.end();
|
||||
isword = 0;
|
||||
} else {
|
||||
isword = Prog::IsWordChar(p[0] & 0xFF);
|
||||
}
|
||||
flag = Prog::EmptyFlags(context, p);
|
||||
}
|
||||
|
||||
// Steal match storage (cleared but unused as of yet)
|
||||
// temporarily to hold match boundaries for new thread.
|
||||
match_[0] = p;
|
||||
AddToThreadq(runq, start_, flag, p, match_);
|
||||
match_[0] = NULL;
|
||||
}
|
||||
|
||||
// If all the threads have died, stop early.
|
||||
if (runq->size() == 0) {
|
||||
if (Debug)
|
||||
fprintf(stderr, "dead\n");
|
||||
break;
|
||||
}
|
||||
|
||||
if (p == text.end())
|
||||
c = 0;
|
||||
else
|
||||
c = *p & 0xFF;
|
||||
wasword = isword;
|
||||
|
||||
// Will run step(runq, nextq, c, ...) on next iteration. See above.
|
||||
}
|
||||
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
|
||||
if (matched_) {
|
||||
for (int i = 0; i < nsubmatch; i++)
|
||||
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
|
||||
if (Debug)
|
||||
fprintf(stderr, "match (%d,%d)\n",
|
||||
static_cast<int>(match_[0] - btext_),
|
||||
static_cast<int>(match_[1] - btext_));
|
||||
return true;
|
||||
}
|
||||
VLOG(1) << "No matches found";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Computes whether all successful matches have a common first byte,
|
||||
// and if so, returns that byte. If not, returns -1.
|
||||
int NFA::ComputeFirstByte() {
|
||||
if (start_ == 0)
|
||||
return -1;
|
||||
|
||||
int b = -1; // first byte, not yet computed
|
||||
|
||||
typedef SparseSet Workq;
|
||||
Workq q(prog_->size());
|
||||
q.insert(start_);
|
||||
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
|
||||
int id = *it;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
// The empty string matches: no first byte.
|
||||
return -1;
|
||||
|
||||
case kInstByteRange:
|
||||
// Must match only a single byte
|
||||
if (ip->lo() != ip->hi())
|
||||
return -1;
|
||||
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
|
||||
return -1;
|
||||
// If we haven't seen any bytes yet, record it;
|
||||
// otherwise must match the one we saw before.
|
||||
if (b == -1)
|
||||
b = ip->lo();
|
||||
else if (b != ip->lo())
|
||||
return -1;
|
||||
break;
|
||||
|
||||
case kInstNop:
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
// Continue on.
|
||||
// Ignore ip->empty() flags for kInstEmptyWidth
|
||||
// in order to be as conservative as possible
|
||||
// (assume all possible empty-width flags are true).
|
||||
if (ip->out())
|
||||
q.insert(ip->out());
|
||||
break;
|
||||
|
||||
case kInstAlt:
|
||||
case kInstAltMatch:
|
||||
// Explore alternatives.
|
||||
if (ip->out())
|
||||
q.insert(ip->out());
|
||||
if (ip->out1())
|
||||
q.insert(ip->out1());
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
bool
|
||||
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch) {
|
||||
if (NFA::Debug)
|
||||
Dump();
|
||||
|
||||
NFA nfa(this);
|
||||
StringPiece sp;
|
||||
if (kind == kFullMatch) {
|
||||
anchor = kAnchored;
|
||||
if (nmatch == 0) {
|
||||
match = &sp;
|
||||
nmatch = 1;
|
||||
}
|
||||
}
|
||||
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
|
||||
return false;
|
||||
if (kind == kFullMatch && match[0].end() != text.end())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,614 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc.
|
||||
//
|
||||
// Prog::SearchOnePass is an efficient implementation of
|
||||
// regular expression search with submatch tracking for
|
||||
// what I call "one-pass regular expressions". (An alternate
|
||||
// name might be "backtracking-free regular expressions".)
|
||||
//
|
||||
// One-pass regular expressions have the property that
|
||||
// at each input byte during an anchored match, there may be
|
||||
// multiple alternatives but only one can proceed for any
|
||||
// given input byte.
|
||||
//
|
||||
// For example, the regexp /x*yx*/ is one-pass: you read
|
||||
// x's until a y, then you read the y, then you keep reading x's.
|
||||
// At no point do you have to guess what to do or back up
|
||||
// and try a different guess.
|
||||
//
|
||||
// On the other hand, /x*x/ is not one-pass: when you're
|
||||
// looking at an input "x", it's not clear whether you should
|
||||
// use it to extend the x* or as the final x.
|
||||
//
|
||||
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
|
||||
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
|
||||
//
|
||||
// A simple intuition for identifying one-pass regular expressions
|
||||
// is that it's always immediately obvious when a repetition ends.
|
||||
// It must also be immediately obvious which branch of an | to take:
|
||||
//
|
||||
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
|
||||
//
|
||||
// The NFA-based search in nfa.cc does some bookkeeping to
|
||||
// avoid the need for backtracking and its associated exponential blowup.
|
||||
// But if we have a one-pass regular expression, there is no
|
||||
// possibility of backtracking, so there is no need for the
|
||||
// extra bookkeeping. Hence, this code.
|
||||
//
|
||||
// On a one-pass regular expression, the NFA code in nfa.cc
|
||||
// runs at about 1/20 of the backtracking-based PCRE speed.
|
||||
// In contrast, the code in this file runs at about the same
|
||||
// speed as PCRE.
|
||||
//
|
||||
// One-pass regular expressions get used a lot when RE is
|
||||
// used for parsing simple strings, so it pays off to
|
||||
// notice them and handle them efficiently.
|
||||
//
|
||||
// See also Anne Brüggemann-Klein and Derick Wood,
|
||||
// "One-unambiguous regular languages", Information and Computation 142(2).
|
||||
|
||||
#include <string.h>
|
||||
#include <map>
|
||||
#include "util/util.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/sparse_set.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Set to a nonzero value to make IsOnePass log (via LOG(ERROR)) why a
// program was rejected and to dump the constructed one-pass automaton.
static const int Debug = 0;
|
||||
|
||||
// The key insight behind this implementation is that the
|
||||
// non-determinism in an NFA for a one-pass regular expression
|
||||
// is contained. To explain what that means, first a
|
||||
// refresher about what regular expression programs look like
|
||||
// and how the usual NFA execution runs.
|
||||
//
|
||||
// In a regular expression program, only the kInstByteRange
|
||||
// instruction processes an input byte c and moves on to the
|
||||
// next byte in the string (it does so if c is in the given range).
|
||||
// The kInstByteRange instructions correspond to literal characters
|
||||
// and character classes in the regular expression.
|
||||
//
|
||||
// The kInstAlt instructions are used as wiring to connect the
|
||||
// kInstByteRange instructions together in interesting ways when
|
||||
// implementing | + and *.
|
||||
// The kInstAlt instruction forks execution, like a goto that
|
||||
// jumps to ip->out() and ip->out1() in parallel. Each of the
|
||||
// resulting computation paths is called a thread.
|
||||
//
|
||||
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
|
||||
// are interesting in their own right but like kInstAlt they don't
|
||||
// advance the input pointer. Only kInstByteRange does.
|
||||
//
|
||||
// The automaton execution in nfa.cc runs all the possible
|
||||
// threads of execution in lock-step over the input. To process
|
||||
// a particular byte, each thread gets run until it either dies
|
||||
// or finds a kInstByteRange instruction matching the byte.
|
||||
// If the latter happens, the thread stops just past the
|
||||
// kInstByteRange instruction (at ip->out()) and waits for
|
||||
// the other threads to finish processing the input byte.
|
||||
// Then, once all the threads have processed that input byte,
|
||||
// the whole process repeats. The kInstAlt state instruction
|
||||
// might create new threads during input processing, but no
|
||||
// matter what, all the threads stop after a kInstByteRange
|
||||
// and wait for the other threads to "catch up".
|
||||
// Running in lock step like this ensures that the NFA reads
|
||||
// the input string only once.
|
||||
//
|
||||
// Each thread maintains its own set of capture registers
|
||||
// (the string positions at which it executed the kInstCapture
|
||||
// instructions corresponding to capturing parentheses in the
|
||||
// regular expression). Repeated copying of the capture registers
|
||||
// is the main performance bottleneck in the NFA implementation.
|
||||
//
|
||||
// A regular expression program is "one-pass" if, no matter what
|
||||
// the input string, there is only one thread that makes it
|
||||
// past a kInstByteRange instruction at each input byte. This means
|
||||
// that there is in some sense only one active thread throughout
|
||||
// the execution. Other threads might be created during the
|
||||
// processing of an input byte, but they are ephemeral: only one
|
||||
// thread is left to start processing the next input byte.
|
||||
// This is what I meant above when I said the non-determinism
|
||||
// was "contained".
|
||||
//
|
||||
// To execute a one-pass regular expression program, we can build
|
||||
// a DFA (no non-determinism) that has at most as many states as
|
||||
// the NFA (compare this to the possibly exponential number of states
|
||||
// in the general case). Each state records, for each possible
|
||||
// input byte, the next state along with the conditions required
|
||||
// before entering that state -- empty-width flags that must be true
|
||||
// and capture operations that must be performed. It also records
|
||||
// the set of conditions required to finish a match at that
|
||||
// point in the input rather than process the next byte.
|
||||
|
||||
// A state in the one-pass NFA (aka DFA) - just an array of actions.
|
||||
struct OneState;
|
||||
|
||||
// A state in the one-pass NFA - just an array of actions indexed
|
||||
// by the bytemap_[] of the next input byte. (The bytemap
|
||||
// maps next input bytes into equivalence classes, to reduce
|
||||
// the memory footprint.)
|
||||
struct OneState {
|
||||
uint32 matchcond; // conditions to match right now.
|
||||
uint32 action[1];
|
||||
};
|
||||
|
||||
// The uint32 conditions in the action are a combination of
|
||||
// condition and capture bits and the next state. The bottom 16 bits
|
||||
// are the condition and capture bits, and the top 16 are the index of
|
||||
// the next state.
|
||||
//
|
||||
// Bits 0-5 are the empty-width flags from prog.h.
|
||||
// Bit 6 is kMatchWins, which means the match takes
|
||||
// priority over moving to next in a first-match search.
|
||||
// The remaining bits mark capture registers that should
|
||||
// be set to the current input position. The capture bits
|
||||
// start at index 2, since the search loop can take care of
|
||||
// cap[0], cap[1] (the overall match position).
|
||||
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
|
||||
// No input position can satisfy both kEmptyWordBoundary
|
||||
// and kEmptyNonWordBoundary, so we can use that as a sentinel
|
||||
// instead of needing an extra bit.
|
||||
|
||||
static const int kIndexShift = 16; // number of bits below index
|
||||
static const int kEmptyShift = 6; // number of empty flags in prog.h
|
||||
static const int kRealCapShift = kEmptyShift + 1;
|
||||
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
|
||||
|
||||
// Parameters used to skip over cap[0], cap[1].
|
||||
static const int kCapShift = kRealCapShift - 2;
|
||||
static const int kMaxCap = kRealMaxCap + 2;
|
||||
|
||||
static const uint32 kMatchWins = 1 << kEmptyShift;
|
||||
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
||||
|
||||
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
||||
|
||||
// Check, at compile time, that prog.h agrees with math above.
|
||||
// This function is never called.
|
||||
void OnePass_Checks() {
|
||||
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
||||
kEmptyShift_disagrees_with_kEmptyAllFlags);
|
||||
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
|
||||
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
|
||||
kMaxCap_disagrees_with_kMaxOnePassCapture);
|
||||
}
|
||||
|
||||
// Reports whether every empty-width flag requested in cond holds at
// position p within context.  Bits of cond outside kEmptyAllFlags are
// ignored.
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
  uint32 holds = Prog::EmptyFlags(context, p);
  uint32 unmet = cond & kEmptyAllFlags & ~holds;
  return unmet == 0;
}
|
||||
|
||||
// Apply the capture bits in cond, saving p to the appropriate
|
||||
// locations in cap[].
|
||||
static void ApplyCaptures(uint32 cond, const char* p,
|
||||
const char** cap, int ncap) {
|
||||
for (int i = 2; i < ncap; i++)
|
||||
if (cond & (1 << kCapShift << i))
|
||||
cap[i] = p;
|
||||
}
|
||||
|
||||
// Compute a node pointer.
|
||||
// Basically (OneState*)(nodes + statesize*nodeindex)
|
||||
// but the version with the C++ casts overflows 80 characters (and is ugly).
|
||||
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
|
||||
int nodeindex) {
|
||||
return reinterpret_cast<OneState*>(
|
||||
const_cast<uint8*>(nodes + statesize*nodeindex));
|
||||
}
|
||||
|
||||
bool Prog::SearchOnePass(const StringPiece& text,
|
||||
const StringPiece& const_context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch) {
|
||||
if (anchor != kAnchored && kind != kFullMatch) {
|
||||
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Make sure we have at least cap[1],
|
||||
// because we use it to tell if we matched.
|
||||
int ncap = 2*nmatch;
|
||||
if (ncap < 2)
|
||||
ncap = 2;
|
||||
|
||||
const char* cap[kMaxCap];
|
||||
for (int i = 0; i < ncap; i++)
|
||||
cap[i] = NULL;
|
||||
|
||||
const char* matchcap[kMaxCap];
|
||||
for (int i = 0; i < ncap; i++)
|
||||
matchcap[i] = NULL;
|
||||
|
||||
StringPiece context = const_context;
|
||||
if (context.begin() == NULL)
|
||||
context = text;
|
||||
if (anchor_start() && context.begin() != text.begin())
|
||||
return false;
|
||||
if (anchor_end() && context.end() != text.end())
|
||||
return false;
|
||||
if (anchor_end())
|
||||
kind = kFullMatch;
|
||||
|
||||
// State and act are marked volatile to
|
||||
// keep the compiler from re-ordering the
|
||||
// memory accesses walking over the NFA.
|
||||
// This is worth about 5%.
|
||||
volatile OneState* state = onepass_start_;
|
||||
volatile uint8* nodes = onepass_nodes_;
|
||||
volatile uint32 statesize = onepass_statesize_;
|
||||
uint8* bytemap = bytemap_;
|
||||
const char* bp = text.begin();
|
||||
const char* ep = text.end();
|
||||
const char* p;
|
||||
bool matched = false;
|
||||
matchcap[0] = bp;
|
||||
cap[0] = bp;
|
||||
uint32 nextmatchcond = state->matchcond;
|
||||
for (p = bp; p < ep; p++) {
|
||||
int c = bytemap[*p & 0xFF];
|
||||
uint32 matchcond = nextmatchcond;
|
||||
uint32 cond = state->action[c];
|
||||
|
||||
// Determine whether we can reach act->next.
|
||||
// If so, advance state and nextmatchcond.
|
||||
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
|
||||
uint32 nextindex = cond >> kIndexShift;
|
||||
state = IndexToNode(nodes, statesize, nextindex);
|
||||
nextmatchcond = state->matchcond;
|
||||
} else {
|
||||
state = NULL;
|
||||
nextmatchcond = kImpossible;
|
||||
}
|
||||
|
||||
// This code section is carefully tuned.
|
||||
// The goto sequence is about 10% faster than the
|
||||
// obvious rewrite as a large if statement in the
|
||||
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
|
||||
|
||||
// Saving the match capture registers is expensive.
|
||||
// Is this intermediate match worth thinking about?
|
||||
|
||||
// Not if we want a full match.
|
||||
if (kind == kFullMatch)
|
||||
goto skipmatch;
|
||||
|
||||
// Not if it's impossible.
|
||||
if (matchcond == kImpossible)
|
||||
goto skipmatch;
|
||||
|
||||
// Not if the possible match is beaten by the certain
|
||||
// match at the next byte. When this test is useless
|
||||
// (e.g., HTTPPartialMatchRE2) it slows the loop by
|
||||
// about 10%, but when it avoids work (e.g., DotMatchRE2),
|
||||
// it cuts the loop execution by about 45%.
|
||||
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
|
||||
goto skipmatch;
|
||||
|
||||
// Finally, the match conditions must be satisfied.
|
||||
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
|
||||
for (int i = 2; i < 2*nmatch; i++)
|
||||
matchcap[i] = cap[i];
|
||||
if (nmatch > 1 && (matchcond & kCapMask))
|
||||
ApplyCaptures(matchcond, p, matchcap, ncap);
|
||||
matchcap[1] = p;
|
||||
matched = true;
|
||||
|
||||
// If we're in longest match mode, we have to keep
|
||||
// going and see if we find a longer match.
|
||||
// In first match mode, we can stop if the match
|
||||
// takes priority over the next state for this input byte.
|
||||
// That bit is per-input byte and thus in cond, not matchcond.
|
||||
if (kind == kFirstMatch && (cond & kMatchWins))
|
||||
goto done;
|
||||
}
|
||||
|
||||
skipmatch:
|
||||
if (state == NULL)
|
||||
goto done;
|
||||
if ((cond & kCapMask) && nmatch > 1)
|
||||
ApplyCaptures(cond, p, cap, ncap);
|
||||
}
|
||||
|
||||
// Look for match at end of input.
|
||||
{
|
||||
uint32 matchcond = state->matchcond;
|
||||
if (matchcond != kImpossible &&
|
||||
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
|
||||
if (nmatch > 1 && (matchcond & kCapMask))
|
||||
ApplyCaptures(matchcond, p, cap, ncap);
|
||||
for (int i = 2; i < ncap; i++)
|
||||
matchcap[i] = cap[i];
|
||||
matchcap[1] = p;
|
||||
matched = true;
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
if (!matched)
|
||||
return false;
|
||||
for (int i = 0; i < nmatch; i++)
|
||||
match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Analysis to determine whether a given regexp program is one-pass.
|
||||
|
||||
// If ip is not on workq, adds ip to work queue and returns true.
|
||||
// If ip is already on work queue, does nothing and returns false.
|
||||
// If ip is NULL, does nothing and returns true (pretends to add it).
|
||||
typedef SparseSet Instq;
|
||||
static bool AddQ(Instq *q, int id) {
|
||||
if (id == 0)
|
||||
return true;
|
||||
if (q->contains(id))
|
||||
return false;
|
||||
q->insert(id);
|
||||
return true;
|
||||
}
|
||||
|
||||
struct InstCond {
|
||||
int id;
|
||||
uint32 cond;
|
||||
};
|
||||
|
||||
// Returns whether this is a one-pass program; that is,
|
||||
// returns whether it is safe to use SearchOnePass on this program.
|
||||
// These conditions must be true for any instruction ip:
|
||||
//
|
||||
// (1) for any other Inst nip, there is at most one input-free
|
||||
// path from ip to nip.
|
||||
// (2) there is at most one kInstByte instruction reachable from
|
||||
// ip that matches any particular byte c.
|
||||
// (3) there is at most one input-free path from ip to a kInstMatch
|
||||
// instruction.
|
||||
//
|
||||
// This is actually just a conservative approximation: it might
|
||||
// return false when the answer is true, when kInstEmptyWidth
|
||||
// instructions are involved.
|
||||
// Constructs and saves corresponding one-pass NFA on success.
|
||||
bool Prog::IsOnePass() {
|
||||
if (did_onepass_)
|
||||
return onepass_start_ != NULL;
|
||||
did_onepass_ = true;
|
||||
|
||||
if (start() == 0) // no match
|
||||
return false;
|
||||
|
||||
// Steal memory for the one-pass NFA from the overall DFA budget.
|
||||
// Willing to use at most 1/4 of the DFA budget (heuristic).
|
||||
// Limit max node count to 65000 as a conservative estimate to
|
||||
// avoid overflowing 16-bit node index in encoding.
|
||||
int maxnodes = 2 + byte_inst_count_;
|
||||
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
|
||||
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
|
||||
return false;
|
||||
|
||||
// Flood the graph starting at the start state, and check
|
||||
// that in each reachable state, each possible byte leads
|
||||
// to a unique next state.
|
||||
int size = this->size();
|
||||
InstCond *stack = new InstCond[size];
|
||||
|
||||
int* nodebyid = new int[size]; // indexed by ip
|
||||
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
|
||||
|
||||
uint8* nodes = new uint8[maxnodes*statesize];
|
||||
uint8* nodep = nodes;
|
||||
|
||||
Instq tovisit(size), workq(size);
|
||||
AddQ(&tovisit, start());
|
||||
nodebyid[start()] = 0;
|
||||
nodep += statesize;
|
||||
int nalloc = 1;
|
||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||
int id = *it;
|
||||
int nodeindex = nodebyid[id];
|
||||
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
||||
|
||||
// Flood graph using manual stack, filling in actions as found.
|
||||
// Default is none.
|
||||
for (int b = 0; b < bytemap_range_; b++)
|
||||
node->action[b] = kImpossible;
|
||||
node->matchcond = kImpossible;
|
||||
|
||||
workq.clear();
|
||||
bool matched = false;
|
||||
int nstack = 0;
|
||||
stack[nstack].id = id;
|
||||
stack[nstack++].cond = 0;
|
||||
while (nstack > 0) {
|
||||
int id = stack[--nstack].id;
|
||||
Prog::Inst* ip = inst(id);
|
||||
uint32 cond = stack[nstack].cond;
|
||||
switch (ip->opcode()) {
|
||||
case kInstAltMatch:
|
||||
// TODO(rsc): Ignoring kInstAltMatch optimization.
|
||||
// Should implement it in this engine, but it's subtle.
|
||||
// Fall through.
|
||||
case kInstAlt:
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
|
||||
goto fail;
|
||||
stack[nstack].id = ip->out1();
|
||||
stack[nstack++].cond = cond;
|
||||
stack[nstack].id = ip->out();
|
||||
stack[nstack++].cond = cond;
|
||||
break;
|
||||
|
||||
case kInstByteRange: {
|
||||
int nextindex = nodebyid[ip->out()];
|
||||
if (nextindex == -1) {
|
||||
if (nalloc >= maxnodes) {
|
||||
if (Debug)
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: hit node limit %d > %d",
|
||||
nalloc, maxnodes);
|
||||
goto fail;
|
||||
}
|
||||
nextindex = nalloc;
|
||||
nodep += statesize;
|
||||
nodebyid[ip->out()] = nextindex;
|
||||
nalloc++;
|
||||
AddQ(&tovisit, ip->out());
|
||||
}
|
||||
if (matched)
|
||||
cond |= kMatchWins;
|
||||
for (int c = ip->lo(); c <= ip->hi(); c++) {
|
||||
int b = bytemap_[c];
|
||||
c = unbytemap_[b]; // last c in byte class
|
||||
uint32 act = node->action[b];
|
||||
uint32 newact = (nextindex << kIndexShift) | cond;
|
||||
if ((act & kImpossible) == kImpossible) {
|
||||
node->action[b] = newact;
|
||||
} else if (act != newact) {
|
||||
if (Debug) {
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: conflict on byte "
|
||||
"%#x at state %d",
|
||||
c, *it);
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
if (ip->foldcase()) {
|
||||
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
||||
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
||||
for (int c = lo; c <= hi; c++) {
|
||||
int b = bytemap_[c];
|
||||
c = unbytemap_[b]; // last c in class
|
||||
uint32 act = node->action[b];
|
||||
uint32 newact = (nextindex << kIndexShift) | cond;
|
||||
if ((act & kImpossible) == kImpossible) {
|
||||
node->action[b] = newact;
|
||||
} else if (act != newact) {
|
||||
if (Debug) {
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: conflict on byte "
|
||||
"%#x at state %d",
|
||||
c, *it);
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case kInstCapture:
|
||||
if (ip->cap() < kMaxCap)
|
||||
cond |= (1 << kCapShift) << ip->cap();
|
||||
goto QueueEmpty;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
cond |= ip->empty();
|
||||
goto QueueEmpty;
|
||||
|
||||
case kInstNop:
|
||||
QueueEmpty:
|
||||
// kInstCapture and kInstNop always proceed to ip->out().
|
||||
// kInstEmptyWidth only sometimes proceeds to ip->out(),
|
||||
// but as a conservative approximation we assume it always does.
|
||||
// We could be a little more precise by looking at what c
|
||||
// is, but that seems like overkill.
|
||||
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, ip->out())) {
|
||||
if (Debug) {
|
||||
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
|
||||
" %d -> %d\n",
|
||||
*it, ip->out());
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
stack[nstack].id = ip->out();
|
||||
stack[nstack++].cond = cond;
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
if (matched) {
|
||||
// (3) is violated
|
||||
if (Debug) {
|
||||
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
|
||||
" from %d\n", *it);
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
matched = true;
|
||||
node->matchcond = cond;
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
|
||||
string dump = "prog dump:\n" + Dump() + "node dump\n";
|
||||
map<int, int> idmap;
|
||||
for (int i = 0; i < size; i++)
|
||||
if (nodebyid[i] != -1)
|
||||
idmap[nodebyid[i]] = i;
|
||||
|
||||
StringAppendF(&dump, "byte ranges:\n");
|
||||
int i = 0;
|
||||
for (int b = 0; b < bytemap_range_; b++) {
|
||||
int lo = i;
|
||||
while (bytemap_[i] == b)
|
||||
i++;
|
||||
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
|
||||
}
|
||||
|
||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||
int id = *it;
|
||||
int nodeindex = nodebyid[id];
|
||||
if (nodeindex == -1)
|
||||
continue;
|
||||
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
||||
string s;
|
||||
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
|
||||
nodeindex, id, node->matchcond);
|
||||
for (int i = 0; i < bytemap_range_; i++) {
|
||||
if ((node->action[i] & kImpossible) == kImpossible)
|
||||
continue;
|
||||
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
|
||||
i, node->action[i] & 0xFFFF,
|
||||
node->action[i] >> kIndexShift,
|
||||
idmap[node->action[i] >> kIndexShift]);
|
||||
}
|
||||
}
|
||||
LOG(ERROR) << dump;
|
||||
}
|
||||
|
||||
// Overallocated earlier; cut down to actual size.
|
||||
nodep = new uint8[nalloc*statesize];
|
||||
memmove(nodep, nodes, nalloc*statesize);
|
||||
delete[] nodes;
|
||||
nodes = nodep;
|
||||
|
||||
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
|
||||
onepass_nodes_ = nodes;
|
||||
onepass_statesize_ = statesize;
|
||||
dfa_mem_ -= nalloc*statesize;
|
||||
|
||||
delete[] stack;
|
||||
delete[] nodebyid;
|
||||
return true;
|
||||
|
||||
fail:
|
||||
delete[] stack;
|
||||
delete[] nodebyid;
|
||||
delete[] nodes;
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace re2
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,119 @@
|
|||
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
|
||||
// make_perl_groups.pl >perl_groups.cc
|
||||
|
||||
#include "re2/unicode_groups.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static URange16 code1[] = { /* \d */
|
||||
{ 0x30, 0x39 },
|
||||
};
|
||||
static URange16 code2[] = { /* \s */
|
||||
{ 0x9, 0xa },
|
||||
{ 0xc, 0xd },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static URange16 code3[] = { /* \w */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x5f, 0x5f },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
UGroup perl_groups[] = {
|
||||
{ "\\d", +1, code1, 1 },
|
||||
{ "\\D", -1, code1, 1 },
|
||||
{ "\\s", +1, code2, 3 },
|
||||
{ "\\S", -1, code2, 3 },
|
||||
{ "\\w", +1, code3, 4 },
|
||||
{ "\\W", -1, code3, 4 },
|
||||
};
|
||||
int num_perl_groups = 6;
|
||||
static URange16 code4[] = { /* [:alnum:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static URange16 code5[] = { /* [:alpha:] */
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static URange16 code6[] = { /* [:ascii:] */
|
||||
{ 0x0, 0x7f },
|
||||
};
|
||||
static URange16 code7[] = { /* [:blank:] */
|
||||
{ 0x9, 0x9 },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static URange16 code8[] = { /* [:cntrl:] */
|
||||
{ 0x0, 0x1f },
|
||||
{ 0x7f, 0x7f },
|
||||
};
|
||||
static URange16 code9[] = { /* [:digit:] */
|
||||
{ 0x30, 0x39 },
|
||||
};
|
||||
static URange16 code10[] = { /* [:graph:] */
|
||||
{ 0x21, 0x7e },
|
||||
};
|
||||
static URange16 code11[] = { /* [:lower:] */
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static URange16 code12[] = { /* [:print:] */
|
||||
{ 0x20, 0x7e },
|
||||
};
|
||||
static URange16 code13[] = { /* [:punct:] */
|
||||
{ 0x21, 0x2f },
|
||||
{ 0x3a, 0x40 },
|
||||
{ 0x5b, 0x60 },
|
||||
{ 0x7b, 0x7e },
|
||||
};
|
||||
static URange16 code14[] = { /* [:space:] */
|
||||
{ 0x9, 0xd },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static URange16 code15[] = { /* [:upper:] */
|
||||
{ 0x41, 0x5a },
|
||||
};
|
||||
static URange16 code16[] = { /* [:word:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x5f, 0x5f },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static URange16 code17[] = { /* [:xdigit:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x46 },
|
||||
{ 0x61, 0x66 },
|
||||
};
|
||||
UGroup posix_groups[] = {
|
||||
{ "[:alnum:]", +1, code4, 3 },
|
||||
{ "[:^alnum:]", -1, code4, 3 },
|
||||
{ "[:alpha:]", +1, code5, 2 },
|
||||
{ "[:^alpha:]", -1, code5, 2 },
|
||||
{ "[:ascii:]", +1, code6, 1 },
|
||||
{ "[:^ascii:]", -1, code6, 1 },
|
||||
{ "[:blank:]", +1, code7, 2 },
|
||||
{ "[:^blank:]", -1, code7, 2 },
|
||||
{ "[:cntrl:]", +1, code8, 2 },
|
||||
{ "[:^cntrl:]", -1, code8, 2 },
|
||||
{ "[:digit:]", +1, code9, 1 },
|
||||
{ "[:^digit:]", -1, code9, 1 },
|
||||
{ "[:graph:]", +1, code10, 1 },
|
||||
{ "[:^graph:]", -1, code10, 1 },
|
||||
{ "[:lower:]", +1, code11, 1 },
|
||||
{ "[:^lower:]", -1, code11, 1 },
|
||||
{ "[:print:]", +1, code12, 1 },
|
||||
{ "[:^print:]", -1, code12, 1 },
|
||||
{ "[:punct:]", +1, code13, 4 },
|
||||
{ "[:^punct:]", -1, code13, 4 },
|
||||
{ "[:space:]", +1, code14, 2 },
|
||||
{ "[:^space:]", -1, code14, 2 },
|
||||
{ "[:upper:]", +1, code15, 1 },
|
||||
{ "[:^upper:]", -1, code15, 1 },
|
||||
{ "[:word:]", +1, code16, 4 },
|
||||
{ "[:^word:]", -1, code16, 4 },
|
||||
{ "[:xdigit:]", +1, code17, 3 },
|
||||
{ "[:^xdigit:]", -1, code17, 3 },
|
||||
};
|
||||
int num_posix_groups = 28;
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,671 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/unicode_casefold.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const int Trace = false;
|
||||
|
||||
typedef set<string>::iterator SSIter;
|
||||
typedef set<string>::const_iterator ConstSSIter;
|
||||
|
||||
static int alloc_id = 100000; // Used for debugging.
|
||||
// Initializes a Prefilter, allocating subs_ as necessary.
|
||||
Prefilter::Prefilter(Op op) {
|
||||
op_ = op;
|
||||
subs_ = NULL;
|
||||
if (op_ == AND || op_ == OR)
|
||||
subs_ = new vector<Prefilter*>;
|
||||
|
||||
alloc_id_ = alloc_id++;
|
||||
VLOG(10) << "alloc_id: " << alloc_id_;
|
||||
}
|
||||
|
||||
// Destroys a Prefilter.
|
||||
Prefilter::~Prefilter() {
|
||||
VLOG(10) << "Deleted: " << alloc_id_;
|
||||
if (subs_) {
|
||||
for (int i = 0; i < subs_->size(); i++)
|
||||
delete (*subs_)[i];
|
||||
delete subs_;
|
||||
subs_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Simplify if the node is an empty Or or And.
|
||||
Prefilter* Prefilter::Simplify() {
|
||||
if (op_ != AND && op_ != OR) {
|
||||
return this;
|
||||
}
|
||||
|
||||
// Nothing left in the AND/OR.
|
||||
if (subs_->size() == 0) {
|
||||
if (op_ == AND)
|
||||
op_ = ALL; // AND of nothing is true
|
||||
else
|
||||
op_ = NONE; // OR of nothing is false
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
// Just one subnode: throw away wrapper.
|
||||
if (subs_->size() == 1) {
|
||||
Prefilter* a = (*subs_)[0];
|
||||
subs_->clear();
|
||||
delete this;
|
||||
return a->Simplify();
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
// Combines two Prefilters together to create an "op" (AND or OR).
|
||||
// The passed Prefilters will be part of the returned Prefilter or deleted.
|
||||
// Does lots of work to avoid creating unnecessarily complicated structures.
|
||||
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
|
||||
// If a, b can be rewritten as op, do so.
|
||||
a = a->Simplify();
|
||||
b = b->Simplify();
|
||||
|
||||
// Canonicalize: a->op <= b->op.
|
||||
if (a->op() > b->op()) {
|
||||
Prefilter* t = a;
|
||||
a = b;
|
||||
b = t;
|
||||
}
|
||||
|
||||
// Trivial cases.
|
||||
// ALL AND b = b
|
||||
// NONE OR b = b
|
||||
// ALL OR b = ALL
|
||||
// NONE AND b = NONE
|
||||
// Don't need to look at b, because of canonicalization above.
|
||||
// ALL and NONE are smallest opcodes.
|
||||
if (a->op() == ALL || a->op() == NONE) {
|
||||
if ((a->op() == ALL && op == AND) ||
|
||||
(a->op() == NONE && op == OR)) {
|
||||
delete a;
|
||||
return b;
|
||||
} else {
|
||||
delete b;
|
||||
return a;
|
||||
}
|
||||
}
|
||||
|
||||
// If a and b match op, merge their contents.
|
||||
if (a->op() == op && b->op() == op) {
|
||||
for (int i = 0; i < b->subs()->size(); i++) {
|
||||
Prefilter* bb = (*b->subs())[i];
|
||||
a->subs()->push_back(bb);
|
||||
}
|
||||
b->subs()->clear();
|
||||
delete b;
|
||||
return a;
|
||||
}
|
||||
|
||||
// If a already has the same op as the op that is under construction
|
||||
// add in b (similarly if b already has the same op, add in a).
|
||||
if (b->op() == op) {
|
||||
Prefilter* t = a;
|
||||
a = b;
|
||||
b = t;
|
||||
}
|
||||
if (a->op() == op) {
|
||||
a->subs()->push_back(b);
|
||||
return a;
|
||||
}
|
||||
|
||||
// Otherwise just return the op.
|
||||
Prefilter* c = new Prefilter(op);
|
||||
c->subs()->push_back(a);
|
||||
c->subs()->push_back(b);
|
||||
return c;
|
||||
}
|
||||
|
||||
// Conjunction of a and b; thin wrapper around AndOr(AND, ...).
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
  Prefilter* conjunction = AndOr(AND, a, b);
  return conjunction;
}
|
||||
|
||||
// Disjunction of a and b; thin wrapper around AndOr(OR, ...).
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
  Prefilter* disjunction = AndOr(OR, a, b);
  return disjunction;
}
|
||||
|
||||
// Removes redundant strings from *ss.  If "ab" is a required string, then
// knowing that "abc" is also required adds nothing: a search that finds
// "ab" already makes the regexp a candidate.  So every string that
// contains an earlier (in set order) string as a substring is erased.
static void SimplifyStringSet(set<string> *ss) {
  for (set<string>::iterator needle = ss->begin();
       needle != ss->end(); ++needle) {
    set<string>::iterator probe = needle;
    for (++probe; probe != ss->end(); ) {
      // Step past the element before possibly erasing it, so that the
      // loop iterator stays valid.
      set<string>::iterator current = probe++;
      if (current->find(*needle) != string::npos)
        ss->erase(current);
    }
  }
}
|
||||
|
||||
// Simplifies *ss in place and returns an OR over its remaining strings.
// Returns NULL when the set is empty.
Prefilter* Prefilter::OrStrings(set<string>* ss) {
  SimplifyStringSet(ss);
  if (ss->empty())
    return NULL;

  // Seed with NONE (the identity for OR) and fold every string in.
  Prefilter* result = new Prefilter(NONE);
  for (SSIter it = ss->begin(); it != ss->end(); ++it)
    result = Or(result, FromString(*it));
  return result;
}
|
||||
|
||||
// Lower-cases a rune.  Runes below Runeself only have 'A'..'Z' mapped;
// all others go through the unicode_tolower case-folding table.
static Rune ToLowerRune(Rune r) {
  if (r >= Runeself) {
    CaseFold* fold = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    // No entry, or r lies below the entry that was found: leave unchanged.
    if (fold == NULL || r < fold->lo)
      return r;
    return ApplyFold(fold, r);
  }

  const bool upper = ('A' <= r && r <= 'Z');
  if (upper)
    r += 'a' - 'A';
  return r;
}
|
||||
|
||||
// Returns a new ATOM node whose atom is a copy of str.
Prefilter* Prefilter::FromString(const string& str) {
  Prefilter* leaf = new Prefilter(Prefilter::ATOM);
  leaf->atom_ = str;
  return leaf;
}
|
||||
|
||||
// Information about a regexp used during computation of Prefilter.
|
||||
// Can be thought of as information about the set of strings matching
|
||||
// the given regular expression.
|
||||
class Prefilter::Info {
|
||||
public:
|
||||
Info();
|
||||
~Info();
|
||||
|
||||
// More constructors. They delete their Info* arguments.
|
||||
static Info* Alt(Info* a, Info* b);
|
||||
static Info* Concat(Info* a, Info* b);
|
||||
static Info* And(Info* a, Info* b);
|
||||
static Info* Star(Info* a);
|
||||
static Info* Plus(Info* a);
|
||||
static Info* Quest(Info* a);
|
||||
static Info* EmptyString();
|
||||
static Info* NoMatch();
|
||||
static Info* AnyChar();
|
||||
static Info* CClass(CharClass* cc);
|
||||
static Info* Literal(Rune r);
|
||||
static Info* AnyMatch();
|
||||
|
||||
// Format Info as a string.
|
||||
string ToString();
|
||||
|
||||
// Caller takes ownership of the Prefilter.
|
||||
Prefilter* TakeMatch();
|
||||
|
||||
set<string>& exact() { return exact_; }
|
||||
|
||||
bool is_exact() const { return is_exact_; }
|
||||
|
||||
class Walker;
|
||||
|
||||
private:
|
||||
set<string> exact_;
|
||||
|
||||
// When is_exact_ is true, the strings that match
|
||||
// are placed in exact_. When it is no longer an exact
|
||||
// set of strings that match this RE, then is_exact_
|
||||
// is false and the match_ contains the required match
|
||||
// criteria.
|
||||
bool is_exact_;
|
||||
|
||||
// Accumulated Prefilter query that any
|
||||
// match for this regexp is guaranteed to match.
|
||||
Prefilter* match_;
|
||||
};
|
||||
|
||||
|
||||
// A fresh Info is inexact and carries no match expression.
Prefilter::Info::Info() : is_exact_(false), match_(NULL) {}
|
||||
|
||||
// Frees the match expression unless it was already handed out via TakeMatch().
Prefilter::Info::~Info() {
  delete match_;
}
|
||||
|
||||
// Releases the match expression to the caller.  An exact Info is first
// converted: its string set becomes an OR of atoms and it stops being exact.
Prefilter* Prefilter::Info::TakeMatch() {
  if (is_exact_) {
    match_ = Prefilter::OrStrings(&exact_);
    is_exact_ = false;
  }

  Prefilter* released = match_;
  match_ = NULL;  // Ownership has moved to the caller.
  return released;
}
|
||||
|
||||
// Format a Info in string form.
|
||||
string Prefilter::Info::ToString() {
|
||||
if (this == NULL) {
|
||||
// Sometimes when iterating on children of a node,
|
||||
// some children might have NULL Info. Adding
|
||||
// the check here for NULL to take care of cases where
|
||||
// the caller is not checking.
|
||||
return "";
|
||||
}
|
||||
|
||||
if (is_exact_) {
|
||||
int n = 0;
|
||||
string s;
|
||||
for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
|
||||
if (n++ > 0)
|
||||
s += ",";
|
||||
s += *i;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
if (match_)
|
||||
return match_->DebugString();
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
// Inserts every string of src into *dst.
static void CopyIn(const set<string>& src, set<string>* dst) {
  dst->insert(src.begin(), src.end());
}
|
||||
|
||||
// Inserts into *dst the concatenation x+y of every x in a with every y in b.
static void CrossProduct(const set<string>& a,
                         const set<string>& b,
                         set<string>* dst) {
  typedef set<string>::const_iterator Iter;
  for (Iter left = a.begin(); left != a.end(); ++left) {
    for (Iter right = b.begin(); right != b.end(); ++right) {
      string joined = *left + *right;
      dst->insert(joined);
    }
  }
}
|
||||
|
||||
// Concats a and b. Requires that both are exact sets.
|
||||
// Forms an exact set that is a crossproduct of a and b.
|
||||
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
|
||||
if (a == NULL)
|
||||
return b;
|
||||
DCHECK(a->is_exact_);
|
||||
DCHECK(b && b->is_exact_);
|
||||
Info *ab = new Info();
|
||||
|
||||
CrossProduct(a->exact_, b->exact_, &ab->exact_);
|
||||
ab->is_exact_ = true;
|
||||
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs an inexact Info for ab given a and b.
|
||||
// Used only when a or b is not exact or when the
|
||||
// exact cross product is likely to be too big.
|
||||
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
|
||||
if (a == NULL)
|
||||
return b;
|
||||
if (b == NULL)
|
||||
return a;
|
||||
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
|
||||
ab->is_exact_ = false;
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a|b given a and b.
|
||||
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
|
||||
Info *ab = new Info();
|
||||
|
||||
if (a->is_exact_ && b->is_exact_) {
|
||||
CopyIn(a->exact_, &ab->exact_);
|
||||
CopyIn(b->exact_, &ab->exact_);
|
||||
ab->is_exact_ = true;
|
||||
} else {
|
||||
// Either a or b has is_exact_ = false. If the other
|
||||
// one has is_exact_ = true, we move it to match_ and
|
||||
// then create a OR of a,b. The resulting Info has
|
||||
// is_exact_ = false.
|
||||
ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
|
||||
ab->is_exact_ = false;
|
||||
}
|
||||
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a? given a.
|
||||
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->is_exact_ = false;
|
||||
ab->match_ = new Prefilter(ALL);
|
||||
delete a;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a* given a.
|
||||
// Same as a? -- not much to do.
|
||||
Prefilter::Info* Prefilter::Info::Star(Info *a) {
|
||||
return Quest(a);
|
||||
}
|
||||
|
||||
// Constructs Info for a+ given a. If a was exact set, it isn't
|
||||
// anymore.
|
||||
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->match_ = a->TakeMatch();
|
||||
ab->is_exact_ = false;
|
||||
|
||||
delete a;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Returns the bytes that runetochar() produces for r, as a string.
static string RuneToString(Rune r) {
  char encoded[UTFmax];
  const int length = runetochar(encoded, &r);
  return string(encoded, length);
}
|
||||
|
||||
// Constructs Info for literal rune.
|
||||
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
|
||||
Info* info = new Info();
|
||||
info->exact_.insert(RuneToString(ToLowerRune(r)));
|
||||
info->is_exact_ = true;
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Info for dot (any character).
|
||||
Prefilter::Info* Prefilter::Info::AnyChar() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(ALL);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for no possible match.
|
||||
Prefilter::Info* Prefilter::Info::NoMatch() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(NONE);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for any possible match.
|
||||
// This Prefilter::Info is valid for any regular expression,
|
||||
// since it makes no assertions whatsoever about the
|
||||
// strings being matched.
|
||||
Prefilter::Info* Prefilter::Info::AnyMatch() {
|
||||
Prefilter::Info *info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(ALL);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for just the empty string.
|
||||
Prefilter::Info* Prefilter::Info::EmptyString() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->is_exact_ = true;
|
||||
info->exact_.insert("");
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for a character class.
|
||||
typedef CharClass::iterator CCIter;
|
||||
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc) {
|
||||
if (Trace) {
|
||||
VLOG(0) << "CharClassInfo:";
|
||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
||||
VLOG(0) << " " << i->lo << "-" << i->hi;
|
||||
}
|
||||
|
||||
// If the class is too large, it's okay to overestimate.
|
||||
if (cc->size() > 10)
|
||||
return AnyChar();
|
||||
|
||||
Prefilter::Info *a = new Prefilter::Info();
|
||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
||||
for (Rune r = i->lo; r <= i->hi; r++)
|
||||
a->exact_.insert(RuneToString(ToLowerRune(r)));
|
||||
|
||||
a->is_exact_ = true;
|
||||
|
||||
if (Trace) {
|
||||
VLOG(0) << " = " << a->ToString();
|
||||
}
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
// Regexp walker that computes a Prefilter::Info for each node it visits.
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
 public:
  Walker() {}

  // Combines the children's Infos into the Info for re.
  virtual Info* PostVisit(Regexp* re,
                          Info* parent_arg,
                          Info* pre_arg,
                          Info** child_args,
                          int nchild_args);

  // Supplies an Info for re without descending into it.
  virtual Info* ShortVisit(Regexp* re, Info* parent_arg);

 private:
  DISALLOW_EVIL_CONSTRUCTORS(Walker);
};
|
||||
|
||||
// Walks re and returns the resulting Info, or NULL if the walk stopped
// early (in which case the partial result is deleted).
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  if (Trace) {
    LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
  }

  Prefilter::Info::Walker walker;
  Prefilter::Info* result = walker.WalkExponential(re, NULL, 100000);
  if (!walker.stopped_early())
    return result;

  delete result;
  return NULL;
}
|
||||
|
||||
// Answers with AnyMatch(), the Info that asserts nothing about the
// strings being matched.
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    Regexp* re, Prefilter::Info* parent_arg) {
  Prefilter::Info* fallback = AnyMatch();
  return fallback;
}
|
||||
|
||||
// Constructs the Prefilter::Info for the given regular expression.
|
||||
// Assumes re is simplified.
|
||||
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
|
||||
Regexp* re, Prefilter::Info* parent_arg,
|
||||
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
|
||||
int nchild_args) {
|
||||
Prefilter::Info *info;
|
||||
switch (re->op()) {
|
||||
default:
|
||||
case kRegexpRepeat:
|
||||
LOG(DFATAL) << "Bad regexp op " << re->op();
|
||||
info = EmptyString();
|
||||
break;
|
||||
|
||||
case kRegexpNoMatch:
|
||||
info = NoMatch();
|
||||
break;
|
||||
|
||||
// These ops match the empty string:
|
||||
case kRegexpEmptyMatch: // anywhere
|
||||
case kRegexpBeginLine: // at beginning of line
|
||||
case kRegexpEndLine: // at end of line
|
||||
case kRegexpBeginText: // at beginning of text
|
||||
case kRegexpEndText: // at end of text
|
||||
case kRegexpWordBoundary: // at word boundary
|
||||
case kRegexpNoWordBoundary: // not at word boundary
|
||||
info = EmptyString();
|
||||
break;
|
||||
|
||||
case kRegexpLiteral:
|
||||
info = Literal(re->rune());
|
||||
break;
|
||||
|
||||
case kRegexpLiteralString:
|
||||
if (re->nrunes() == 0) {
|
||||
info = NoMatch();
|
||||
break;
|
||||
}
|
||||
info = Literal(re->runes()[0]);
|
||||
for (int i = 1; i < re->nrunes(); i++)
|
||||
info = Concat(info, Literal(re->runes()[i]));
|
||||
break;
|
||||
|
||||
case kRegexpConcat: {
|
||||
// Accumulate in info.
|
||||
// Exact is concat of recent contiguous exact nodes.
|
||||
info = NULL;
|
||||
Info* exact = NULL;
|
||||
for (int i = 0; i < nchild_args; i++) {
|
||||
Info* ci = child_args[i]; // child info
|
||||
if (!ci->is_exact() ||
|
||||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
|
||||
// Exact run is over.
|
||||
info = And(info, exact);
|
||||
exact = NULL;
|
||||
// Add this child's info.
|
||||
info = And(info, ci);
|
||||
} else {
|
||||
// Append to exact run.
|
||||
exact = Concat(exact, ci);
|
||||
}
|
||||
}
|
||||
info = And(info, exact);
|
||||
}
|
||||
break;
|
||||
|
||||
case kRegexpAlternate:
|
||||
info = child_args[0];
|
||||
for (int i = 1; i < nchild_args; i++)
|
||||
info = Alt(info, child_args[i]);
|
||||
VLOG(10) << "Alt: " << info->ToString();
|
||||
break;
|
||||
|
||||
case kRegexpStar:
|
||||
info = Star(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpQuest:
|
||||
info = Quest(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpPlus:
|
||||
info = Plus(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpAnyChar:
|
||||
// Claim nothing, except that it's not empty.
|
||||
info = AnyChar();
|
||||
break;
|
||||
|
||||
case kRegexpCharClass:
|
||||
info = CClass(re->cc());
|
||||
break;
|
||||
|
||||
case kRegexpCapture:
|
||||
// These don't affect the set of matching strings.
|
||||
info = child_args[0];
|
||||
break;
|
||||
}
|
||||
|
||||
if (Trace) {
|
||||
VLOG(0) << "BuildInfo " << re->ToString()
|
||||
<< ": " << info->ToString();
|
||||
}
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
|
||||
// Builds a Prefilter for re.  Returns NULL if re is NULL or no Info
// could be built for it.
Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == NULL)
    return NULL;

  // Info is computed on the simplified form; the reference obtained
  // from Simplify() is dropped as soon as the Info has been built.
  Regexp* simplified = re->Simplify();
  Prefilter::Info* info = BuildInfo(simplified);
  simplified->Decref();

  if (info == NULL)
    return NULL;

  Prefilter* match = info->TakeMatch();
  delete info;
  return match;
}
|
||||
|
||||
// Returns a readable rendering of the prefilter tree:
//   NONE -> "*no-matches*", ATOM -> the atom, ALL -> "",
//   AND  -> children separated by spaces,
//   OR   -> "(" children separated by "|" ")".
// NULL children are rendered as "<nil>".
string Prefilter::DebugString() const {
  // Legacy guard kept for callers that still invoke this through a null
  // pointer.  That is undefined behaviour and optimizing compilers may
  // remove the test, so the recursion below checks children explicitly
  // instead of relying on it.
  if (this == NULL)
    return "<nil>";

  switch (op_) {
    default:
      LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
      return StringPrintf("op%d", op_);
    case NONE:
      return "*no-matches*";
    case ATOM:
      return atom_;
    case ALL:
      return "";
    case AND: {
      string s = "";
      for (size_t i = 0; i < subs_->size(); i++) {
        if (i > 0)
          s += " ";
        const Prefilter* sub = (*subs_)[i];
        s += (sub != NULL) ? sub->DebugString() : "<nil>";
      }
      return s;
    }
    case OR: {
      string s = "(";
      for (size_t i = 0; i < subs_->size(); i++) {
        if (i > 0)
          s += "|";
        const Prefilter* sub = (*subs_)[i];
        s += (sub != NULL) ? sub->DebugString() : "<nil>";
      }
      s += ")";
      return s;
    }
  }
}
|
||||
|
||||
// Builds a Prefilter from a compiled RE2.  Returns NULL if re2 is NULL,
// has no Regexp, or no Prefilter can be formed.
Prefilter* Prefilter::FromRE2(const RE2* re2) {
  if (re2 != NULL) {
    Regexp* parsed = re2->Regexp();
    if (parsed != NULL)
      return FromRegexp(parsed);
  }
  return NULL;
}
|
||||
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,105 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Prefilter is the class used to extract string guards from regexps.
|
||||
// Rather than using Prefilter class directly, use FilteredRE2.
|
||||
// See filtered_re2.h
|
||||
|
||||
#ifndef RE2_PREFILTER_H_
|
||||
#define RE2_PREFILTER_H_
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class RE2;
|
||||
|
||||
class Regexp;
|
||||
|
||||
class Prefilter {
|
||||
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
|
||||
public:
|
||||
enum Op {
|
||||
ALL = 0, // Everything matches
|
||||
NONE, // Nothing matches
|
||||
ATOM, // The string atom() must match
|
||||
AND, // All in subs() must match
|
||||
OR, // One of subs() must match
|
||||
};
|
||||
|
||||
explicit Prefilter(Op op);
|
||||
~Prefilter();
|
||||
|
||||
Op op() { return op_; }
|
||||
const string& atom() const { return atom_; }
|
||||
void set_unique_id(int id) { unique_id_ = id; }
|
||||
int unique_id() const { return unique_id_; }
|
||||
|
||||
// The children of the Prefilter node.
|
||||
vector<Prefilter*>* subs() {
|
||||
CHECK(op_ == AND || op_ == OR);
|
||||
return subs_;
|
||||
}
|
||||
|
||||
// Set the children vector. Prefilter takes ownership of subs and
|
||||
// subs_ will be deleted when Prefilter is deleted.
|
||||
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
|
||||
|
||||
// Given a RE2, return a Prefilter. The caller takes ownership of
|
||||
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
||||
// cannot be formed.
|
||||
static Prefilter* FromRE2(const RE2* re2);
|
||||
|
||||
// Returns a readable debug string of the prefilter.
|
||||
string DebugString() const;
|
||||
|
||||
private:
|
||||
class Info;
|
||||
|
||||
// Combines two prefilters together to create an AND. The passed
|
||||
// Prefilters will be part of the returned Prefilter or deleted.
|
||||
static Prefilter* And(Prefilter* a, Prefilter* b);
|
||||
|
||||
// Combines two prefilters together to create an OR. The passed
|
||||
// Prefilters will be part of the returned Prefilter or deleted.
|
||||
static Prefilter* Or(Prefilter* a, Prefilter* b);
|
||||
|
||||
// Generalized And/Or
|
||||
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
|
||||
|
||||
static Prefilter* FromRegexp(Regexp* a);
|
||||
|
||||
static Prefilter* FromString(const string& str);
|
||||
|
||||
static Prefilter* OrStrings(set<string>* ss);
|
||||
|
||||
static Info* BuildInfo(Regexp* re);
|
||||
|
||||
Prefilter* Simplify();
|
||||
|
||||
// Kind of Prefilter.
|
||||
Op op_;
|
||||
|
||||
// Sub-matches for AND or OR Prefilter.
|
||||
vector<Prefilter*>* subs_;
|
||||
|
||||
// Actual string to match in leaf node.
|
||||
string atom_;
|
||||
|
||||
// If different prefilters have the same string atom, or if they are
|
||||
// structurally the same (e.g., OR of same atom strings) they are
|
||||
// considered the same unique nodes. This is the id for each unique
|
||||
// node. This field is populated with a unique id for every node,
|
||||
// and -1 for duplicate nodes.
|
||||
int unique_id_;
|
||||
|
||||
// Used for debugging, helps in tracking memory leaks.
|
||||
int alloc_id_;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_PREFILTER_H_
|
|
@ -0,0 +1,398 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/flags.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/prefilter_tree.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
// Minimum atom length, compared against atom sizes in KeepPart().
DEFINE_int32(filtered_re2_min_atom_len, 3,
             "Strings less than this length are not stored as atoms");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// A new tree starts out uncompiled.
PrefilterTree::PrefilterTree() : compiled_(false) {}
|
||||
|
||||
// Frees every prefilter that was added and every entry's parent map.
PrefilterTree::~PrefilterTree() {
  for (size_t p = 0; p < prefilter_vec_.size(); p++)
    delete prefilter_vec_[p];

  for (size_t e = 0; e < entries_.size(); e++)
    delete entries_[e].parents;
}
|
||||
|
||||
// Functions used for adding and Compiling prefilters to the
|
||||
// PrefilterTree.
|
||||
static bool KeepPart(Prefilter* prefilter, int level) {
|
||||
if (prefilter == NULL)
|
||||
return false;
|
||||
|
||||
switch (prefilter->op()) {
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected op in KeepPart: "
|
||||
<< prefilter->op();
|
||||
return false;
|
||||
|
||||
case Prefilter::ALL:
|
||||
return false;
|
||||
|
||||
case Prefilter::ATOM:
|
||||
return prefilter->atom().size() >=
|
||||
FLAGS_filtered_re2_min_atom_len;
|
||||
|
||||
case Prefilter::AND: {
|
||||
int j = 0;
|
||||
vector<Prefilter*>* subs = prefilter->subs();
|
||||
for (int i = 0; i < subs->size(); i++)
|
||||
if (KeepPart((*subs)[i], level + 1))
|
||||
(*subs)[j++] = (*subs)[i];
|
||||
else
|
||||
delete (*subs)[i];
|
||||
|
||||
subs->resize(j);
|
||||
return j > 0;
|
||||
}
|
||||
|
||||
case Prefilter::OR:
|
||||
for (int i = 0; i < prefilter->subs()->size(); i++)
|
||||
if (!KeepPart((*prefilter->subs())[i], level + 1))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Registers the prefilter of one regexp.  A prefilter that KeepPart()
// rejects is deleted and recorded as NULL.  Must not be called after
// Compile().
void PrefilterTree::Add(Prefilter *f) {
  if (compiled_) {
    LOG(DFATAL) << "Add after Compile.";
    return;
  }

  Prefilter* kept = f;
  if (kept != NULL && !KeepPart(kept, 0)) {
    delete kept;
    kept = NULL;
  }

  // NULL entries are pushed too, so every Add() call occupies one slot.
  prefilter_vec_.push_back(kept);
}
|
||||
|
||||
// Assigns unique ids to the added prefilters, fills *atom_vec, and then
// prunes triggers that fan out to too many parents.  May only run once.
void PrefilterTree::Compile(vector<string>* atom_vec) {
  if (compiled_) {
    LOG(DFATAL) << "Compile after Compile.";
    return;
  }

  // Some legacy users call Compile before adding any regexps and
  // expect that call to have no effect.
  if (prefilter_vec_.empty())
    return;

  compiled_ = true;

  AssignUniqueIds(atom_vec);

  // Find nodes that are so common among prefilters that they trigger too
  // many parents, and drop them as triggers where possible.  Dropping a
  // node only means its parents no longer need it in order to trigger, so
  // no regexp is missed because of it.
  for (size_t e = 0; e < entries_.size(); e++) {
    IntMap* parents = entries_[e].parents;
    if (parents->size() <= 8)
      continue;

    // This one triggers too many things.  It can go only if every parent
    // still has something else guarding it.
    // TODO(vsri): Adjust the threshold appropriately,
    // make it a function of total number of nodes?
    bool all_guarded = true;
    for (IntMap::iterator it = parents->begin();
         all_guarded && it != parents->end(); ++it)
      all_guarded = entries_[it->index()].propagate_up_at_count > 1;
    if (!all_guarded)
      continue;

    for (IntMap::iterator it = parents->begin();
         it != parents->end(); ++it)
      entries_[it->index()].propagate_up_at_count -= 1;

    parents->clear();  // Forget the parents
  }

  PrintDebugInfo();
}
|
||||
|
||||
// Looks up the node registered in node_map_ under node's NodeString().
// Returns NULL if there is none.
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
  const string key = NodeString(node);
  map<string, Prefilter*>::iterator found = node_map_.find(key);
  if (found != node_map_.end())
    return found->second;
  return NULL;
}
|
||||
|
||||
// Formats n in decimal.
static string Itoa(int n) {
  char digits[100];
  snprintf(digits, sizeof(digits), "%d", n);
  string text(digits);
  return text;
}
|
||||
|
||||
// Builds the lookup key for node: "<op>:" followed by the atom text for an
// ATOM node, or by the comma-separated unique ids of its children otherwise.
string PrefilterTree::NodeString(Prefilter* node) const {
  // Leading with the operation disambiguates AND/OR/atom nodes.
  string key = Itoa(node->op());
  key += ":";
  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  const vector<Prefilter*>& children = *node->subs();
  for (int k = 0; k < children.size(); k++) {
    if (k != 0)
      key += ',';
    key += Itoa(children[k]->unique_id());
  }
  return key;
}
|
||||
|
||||
// Assigns a unique id to every distinct prefilter node (distinct by
// NodeString), appends each distinct atom to *atom_vec, and fills entries_
// with the parent links, trigger counts and regexp ids used during search.
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
  atom_vec->clear();

  // All filter nodes, in top-down topological order.
  vector<Prefilter*> nodes;

  // Start with the top-level node of each regexp.  NULL prefilters are
  // remembered as unfiltered but still pushed, so that for the top-level
  // nodes the index in nodes equals the regexp id.
  for (int i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* root = prefilter_vec_[i];
    if (root == NULL)
      unfiltered_.push_back(i);
    nodes.push_back(root);
  }

  // Append all descendants.  nodes grows while it is being walked, so it
  // is indexed rather than iterated.
  for (int i = 0; i < nodes.size(); i++) {
    Prefilter* cur = nodes[i];
    if (cur == NULL)
      continue;
    if (cur->op() != Prefilter::AND && cur->op() != Prefilter::OR)
      continue;
    const vector<Prefilter*>& subs = *cur->subs();
    for (int j = 0; j < subs.size(); j++)
      nodes.push_back(subs[j]);
  }

  // Identify unique nodes, walking bottom-up so that children already
  // carry their ids when a parent's NodeString is computed.
  int next_id = 0;
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL)
      continue;
    node->set_unique_id(-1);
    Prefilter* canonical = CanonicalNode(node);
    if (canonical != NULL) {
      // Duplicate of an already registered node: share its id.
      node->set_unique_id(canonical->unique_id());
      continue;
    }
    // First node with this node string; any later node with the same
    // string will find this one as its canonical node.
    node_map_[NodeString(node)] = node;
    if (node->op() == Prefilter::ATOM) {
      atom_vec->push_back(node->atom());
      atom_index_to_id_.push_back(next_id);
    }
    node->set_unique_id(next_id++);
  }
  entries_.resize(node_map_.size());

  // Create the parent IntMap of every canonical node's entry.
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* prefilter = nodes[i];
    if (prefilter == NULL || CanonicalNode(prefilter) != prefilter)
      continue;

    entries_[prefilter->unique_id()].parents = new IntMap(node_map_.size());
  }

  // Fill the entries.
  for (int i = nodes.size() - 1; i >= 0; i--) {
    Prefilter* prefilter = nodes[i];
    if (prefilter == NULL || CanonicalNode(prefilter) != prefilter)
      continue;

    const int self_id = prefilter->unique_id();
    Entry* entry = &entries_[self_id];

    switch (prefilter->op()) {
      case Prefilter::ATOM:
        entry->propagate_up_at_count = 1;
        break;

      case Prefilter::OR:
      case Prefilter::AND: {
        // Ids of the distinct children seen so far.
        IntMap uniq_child(node_map_.size());
        const vector<Prefilter*>& subs = *prefilter->subs();
        for (int j = 0; j < subs.size(); j++) {
          Prefilter* canonical = CanonicalNode(subs[j]);
          if (canonical == NULL) {
            LOG(DFATAL) << "Null canonical node";
            return;
          }
          const int child_id = canonical->unique_id();
          if (!uniq_child.has_index(child_id))
            uniq_child.set_new(child_id, 1);
          // Record this node as a parent of the child.
          IntMap* child_parents = entries_[child_id].parents;
          if (!child_parents->has_index(self_id))
            child_parents->set_new(self_id, 1);
        }
        // An AND node waits for all of its distinct children; an OR node
        // fires on the first one.
        if (prefilter->op() == Prefilter::AND)
          entry->propagate_up_at_count = uniq_child.size();
        else
          entry->propagate_up_at_count = 1;
        break;
      }

      case Prefilter::ALL:
      default:
        LOG(DFATAL) << "Unexpected op: " << prefilter->op();
        return;
    }
  }

  // For top level nodes, populate regexp id.
  for (int i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* root = prefilter_vec_[i];
    if (root == NULL)
      continue;
    const int id = CanonicalNode(root)->unique_id();
    DCHECK_LE(0, id);
    entries_[id].regexps.push_back(i);
  }
}
|
||||
|
||||
// Functions for triggering during search.
|
||||
void PrefilterTree::RegexpsGivenStrings(
|
||||
const vector<int>& matched_atoms,
|
||||
vector<int>* regexps) const {
|
||||
regexps->clear();
|
||||
if (!compiled_) {
|
||||
LOG(WARNING) << "Compile() not called";
|
||||
for (int i = 0; i < prefilter_vec_.size(); ++i)
|
||||
regexps->push_back(i);
|
||||
} else {
|
||||
if (!prefilter_vec_.empty()) {
|
||||
IntMap regexps_map(prefilter_vec_.size());
|
||||
vector<int> matched_atom_ids;
|
||||
for (int j = 0; j < matched_atoms.size(); j++) {
|
||||
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
||||
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
|
||||
}
|
||||
PropagateMatch(matched_atom_ids, ®exps_map);
|
||||
for (IntMap::iterator it = regexps_map.begin();
|
||||
it != regexps_map.end();
|
||||
++it)
|
||||
regexps->push_back(it->index());
|
||||
|
||||
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
||||
}
|
||||
}
|
||||
sort(regexps->begin(), regexps->end());
|
||||
}
|
||||
|
||||
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
||||
IntMap* regexps) const {
|
||||
IntMap count(entries_.size());
|
||||
IntMap work(entries_.size());
|
||||
for (int i = 0; i < atom_ids.size(); i++)
|
||||
work.set(atom_ids[i], 1);
|
||||
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
|
||||
const Entry& entry = entries_[it->index()];
|
||||
VLOG(10) << "Processing: " << it->index();
|
||||
// Record regexps triggered.
|
||||
for (int i = 0; i < entry.regexps.size(); i++) {
|
||||
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
|
||||
regexps->set(entry.regexps[i], 1);
|
||||
}
|
||||
int c;
|
||||
// Pass trigger up to parents.
|
||||
for (IntMap::iterator it = entry.parents->begin();
|
||||
it != entry.parents->end();
|
||||
++it) {
|
||||
int j = it->index();
|
||||
const Entry& parent = entries_[j];
|
||||
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
|
||||
// Delay until all the children have succeeded.
|
||||
if (parent.propagate_up_at_count > 1) {
|
||||
if (count.has_index(j)) {
|
||||
c = count.get_existing(j) + 1;
|
||||
count.set_existing(j, c);
|
||||
} else {
|
||||
c = 1;
|
||||
count.set_new(j, c);
|
||||
}
|
||||
if (c < parent.propagate_up_at_count)
|
||||
continue;
|
||||
}
|
||||
VLOG(10) << "Triggering: " << j;
|
||||
// Trigger the parent.
|
||||
work.set(j, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Debugging help.
|
||||
void PrefilterTree::PrintPrefilter(int regexpid) {
|
||||
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
|
||||
}
|
||||
|
||||
void PrefilterTree::PrintDebugInfo() {
|
||||
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
|
||||
VLOG(10) << "#Unique Nodes: " << entries_.size();
|
||||
|
||||
for (int i = 0; i < entries_.size(); ++i) {
|
||||
IntMap* parents = entries_[i].parents;
|
||||
const vector<int>& regexps = entries_[i].regexps;
|
||||
VLOG(10) << "EntryId: " << i
|
||||
<< " N: " << parents->size() << " R: " << regexps.size();
|
||||
for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
||||
VLOG(10) << it->index();
|
||||
}
|
||||
VLOG(10) << "Map:";
|
||||
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
|
||||
iter != node_map_.end(); ++iter)
|
||||
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
|
||||
<< " Str: " << (*iter).first;
|
||||
}
|
||||
|
||||
// Recursively renders node in readable form: an atom is its own text; any
// other node is "AND(...)" or "OR(...)" listing each child as
// "<unique id>:<child rendering>", separated by commas.
string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    return node->atom();
  }

  // Adding the operation disambiguates AND and OR nodes.
  string text = (node->op() == Prefilter::AND) ? "AND" : "OR";
  text += "(";
  const vector<Prefilter*>& children = *node->subs();
  for (int k = 0; k < children.size(); k++) {
    if (k != 0)
      text += ',';
    text += Itoa(children[k]->unique_id());
    text += ":";
    text += DebugNodeString(children[k]);
  }
  text += ")";
  return text;
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,130 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// The PrefilterTree class is used to form an AND-OR tree of strings
|
||||
// that would trigger each regexp. The 'prefilter' of each regexp is
|
||||
// added to PrefilterTree, and then PrefilterTree is used to find all
|
||||
// the unique strings across the prefilters. During search, by using
|
||||
// matches from a string matching engine, PrefilterTree deduces the
|
||||
// set of regexps that are to be triggered. The 'string matching
|
||||
// engine' itself is outside of this class, and the caller can use any
|
||||
// favorite engine. PrefilterTree provides a set of strings (called
|
||||
// atoms) that the user of this class should use to do the string
|
||||
// matching.
|
||||
//
|
||||
#ifndef RE2_PREFILTER_TREE_H_
|
||||
#define RE2_PREFILTER_TREE_H_
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/sparse_array.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
typedef SparseArray<int> IntMap;
|
||||
|
||||
class Prefilter;
|
||||
|
||||
class PrefilterTree {
|
||||
public:
|
||||
PrefilterTree();
|
||||
~PrefilterTree();
|
||||
|
||||
// Adds the prefilter for the next regexp. Note that we assume that
|
||||
// Add called sequentially for all regexps. All Add calls
|
||||
// must precede Compile.
|
||||
void Add(Prefilter* prefilter);
|
||||
|
||||
// The Compile returns a vector of string in atom_vec.
|
||||
// Call this after all the prefilters are added through Add.
|
||||
// No calls to Add after Compile are allowed.
|
||||
// The caller should use the returned set of strings to do string matching.
|
||||
// Each time a string matches, the corresponding index then has to be
|
||||
// and passed to RegexpsGivenStrings below.
|
||||
void Compile(vector<string>* atom_vec);
|
||||
|
||||
// Given the indices of the atoms that matched, returns the indexes
|
||||
// of regexps that should be searched. The matched_atoms should
|
||||
// contain all the ids of string atoms that were found to match the
|
||||
// content. The caller can use any string match engine to perform
|
||||
// this function. This function is thread safe.
|
||||
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
||||
vector<int>* regexps) const;
|
||||
|
||||
// Print debug prefilter. Also prints unique ids associated with
|
||||
// nodes of the prefilter of the regexp.
|
||||
void PrintPrefilter(int regexpid);
|
||||
|
||||
|
||||
// Each unique node has a corresponding Entry that helps in
|
||||
// passing the matching trigger information along the tree.
|
||||
struct Entry {
|
||||
public:
|
||||
// How many children should match before this node triggers the
|
||||
// parent. For an atom and an OR node, this is 1 and for an AND
|
||||
// node, it is the number of unique children.
|
||||
int propagate_up_at_count;
|
||||
|
||||
// When this node is ready to trigger the parent, what are the indices
|
||||
// of the parent nodes to trigger. The reason there may be more than
|
||||
// one is because of sharing. For example (abc | def) and (xyz | def)
|
||||
// are two different nodes, but they share the atom 'def'. So when
|
||||
// 'def' matches, it triggers two parents, corresponding to the two
|
||||
// different OR nodes.
|
||||
IntMap* parents;
|
||||
|
||||
// When this node is ready to trigger the parent, what are the
|
||||
// regexps that are triggered.
|
||||
vector<int> regexps;
|
||||
};
|
||||
|
||||
private:
|
||||
// This function assigns unique ids to various parts of the
|
||||
// prefilter, by looking at if these nodes are already in the
|
||||
// PrefilterTree.
|
||||
void AssignUniqueIds(vector<string>* atom_vec);
|
||||
|
||||
// Given the matching atoms, find the regexps to be triggered.
|
||||
void PropagateMatch(const vector<int>& atom_ids,
|
||||
IntMap* regexps) const;
|
||||
|
||||
// Returns the prefilter node that has the same NodeString as this
|
||||
// node. For the canonical node, returns node.
|
||||
Prefilter* CanonicalNode(Prefilter* node);
|
||||
|
||||
// A string that uniquely identifies the node. Assumes that the
|
||||
// children of node has already been assigned unique ids.
|
||||
string NodeString(Prefilter* node) const;
|
||||
|
||||
// Recursively constructs a readable prefilter string.
|
||||
string DebugNodeString(Prefilter* node) const;
|
||||
|
||||
// Used for debugging.
|
||||
void PrintDebugInfo();
|
||||
|
||||
// These are all the nodes formed by Compile. Essentially, there is
|
||||
// one node for each unique atom and each unique AND/OR node.
|
||||
vector<Entry> entries_;
|
||||
|
||||
// Map node string to canonical Prefilter node.
|
||||
map<string, Prefilter*> node_map_;
|
||||
|
||||
// indices of regexps that always pass through the filter (since we
|
||||
// found no required literals in these regexps).
|
||||
vector<int> unfiltered_;
|
||||
|
||||
// vector of Prefilter for all regexps.
|
||||
vector<Prefilter*> prefilter_vec_;
|
||||
|
||||
// Atom index in returned strings to entry id mapping.
|
||||
vector<int> atom_index_to_id_;
|
||||
|
||||
// Has the prefilter tree been compiled.
|
||||
bool compiled_;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // RE2_PREFILTER_TREE_H_
|
|
@ -0,0 +1,341 @@
|
|||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Compiled regular expression representation.
|
||||
// Tested by compile_test.cc
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/sparse_set.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Constructors per Inst opcode
|
||||
|
||||
// Makes this instruction an Alt with successors out and out1.
// Expects out_opcode_ to still be zero.
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
  DCHECK_EQ(out_opcode_, 0);
  out1_ = out1;
  set_out_opcode(out, kInstAlt);
}
|
||||
|
||||
// Makes this instruction a ByteRange over [lo, hi] (each reduced to its low
// 8 bits) with the given foldcase setting and successor out.
// Expects out_opcode_ to still be zero.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);
  foldcase_ = foldcase;
  hi_ = hi & 0xFF;
  lo_ = lo & 0xFF;
  set_out_opcode(out, kInstByteRange);
}
|
||||
|
||||
// Makes this instruction a Capture into register cap with successor out.
// Expects out_opcode_ to still be zero.
void Prog::Inst::InitCapture(int cap, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);
  cap_ = cap;
  set_out_opcode(out, kInstCapture);
}
|
||||
|
||||
// Makes this instruction an EmptyWidth check for the flags in empty, with
// successor out.  Expects out_opcode_ to still be zero.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
  DCHECK_EQ(out_opcode_, 0);
  empty_ = empty;
  set_out_opcode(out, kInstEmptyWidth);
}
|
||||
|
||||
// Makes this instruction a Match carrying match id `id`.
// Expects out_opcode_ to still be zero.
void Prog::Inst::InitMatch(int32 id) {
  DCHECK_EQ(out_opcode_, 0);
  match_id_ = id;
  set_opcode(kInstMatch);
}
|
||||
|
||||
// Makes this instruction a Nop with successor out.
// Expects out_opcode_ to still be zero.
void Prog::Inst::InitNop(uint32 out) {
  DCHECK_EQ(out_opcode_, 0);
  // Store the successor like the other initializers that take `out` do;
  // the parameter used to be ignored.  For out == 0 the stored value is the
  // same as before.
  set_out_opcode(out, kInstNop);
}
|
||||
|
||||
void Prog::Inst::InitFail() {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_opcode(kInstFail);
|
||||
}
|
||||
|
||||
// Returns a one-line description of this instruction for debugging; the
// format depends on the opcode.
string Prog::Inst::Dump() {
  switch (opcode()) {
    case kInstAlt:
      return StringPrintf("alt -> %d | %d", out(), out1_);

    case kInstAltMatch:
      return StringPrintf("altmatch -> %d | %d", out(), out1_);

    case kInstByteRange: {
      const char* fold = foldcase_ ? "/i" : "";
      return StringPrintf("byte%s [%02x-%02x] -> %d",
                          fold, lo_, hi_, out());
    }

    case kInstCapture:
      return StringPrintf("capture %d -> %d", cap_, out());

    case kInstEmptyWidth:
      return StringPrintf("emptywidth %#x -> %d",
                          static_cast<int>(empty_), out());

    case kInstMatch:
      return StringPrintf("match! %d", match_id());

    case kInstNop:
      return StringPrintf("nop -> %d", out());

    case kInstFail:
      return StringPrintf("fail");

    default:
      return StringPrintf("opcode %d", static_cast<int>(opcode()));
  }
}
|
||||
|
||||
// Creates an empty program: flags false, counters and sizes zero, and all
// owned pointers (and the DFA deleter) NULL.
Prog::Prog()
    : anchor_start_(false), anchor_end_(false),
      reversed_(false), did_onepass_(false),
      start_(0), start_unanchored_(0),
      size_(0), byte_inst_count_(0),
      bytemap_range_(0), flags_(0),
      onepass_statesize_(0),
      inst_(NULL),
      dfa_first_(NULL), dfa_longest_(NULL),
      dfa_mem_(0),
      delete_dfa_(NULL),
      unbytemap_(NULL),
      onepass_nodes_(NULL), onepass_start_(NULL) {
}
|
||||
|
||||
// Releases the DFAs through delete_dfa_ (when one is set) and frees the
// arrays owned by the program.
Prog::~Prog() {
  if (delete_dfa_ != NULL) {
    if (dfa_first_ != NULL)
      delete_dfa_(dfa_first_);
    if (dfa_longest_ != NULL)
      delete_dfa_(dfa_longest_);
  }
  delete[] onepass_nodes_;
  delete[] inst_;
  delete[] unbytemap_;
}
|
||||
|
||||
typedef SparseSet Workq;
|
||||
|
||||
// Inserts id into q, except that id 0 is never queued.
static inline void AddToQueue(Workq* q, int id) {
  if (id == 0)
    return;
  q->insert(id);
}
|
||||
|
||||
// Renders, one per line as "<id>. <dump>", every instruction reachable from
// the ids already in q.  q doubles as the worklist: successors are added to
// it while it is being walked.
static string ProgToString(Prog* prog, Workq* q) {
  string listing;

  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
    const int id = *it;
    Prog::Inst* inst = prog->inst(id);
    StringAppendF(&listing, "%d. %s\n", id, inst->Dump().c_str());
    AddToQueue(q, inst->out());
    const InstOp op = inst->opcode();
    if (op == kInstAlt || op == kInstAltMatch)
      AddToQueue(q, inst->out1());
  }
  return listing;
}
|
||||
|
||||
// Returns the listing of all instructions reachable from start_.
string Prog::Dump() {
  string text;
  if (false) {  // Debugging: prepend the byte map.
    StringAppendF(&text, "byte map:\n");
    int range_lo = 0;
    for (int c = 0; c < bytemap_range_; c++) {
      StringAppendF(&text, "\t%d. [%02x-%02x]\n", c, range_lo, unbytemap_[c]);
      range_lo = unbytemap_[c] + 1;
    }
    StringAppendF(&text, "\n");
  }

  Workq q(size_);
  AddToQueue(&q, start_);
  text += ProgToString(this, &q);
  return text;
}
|
||||
|
||||
// Returns the listing of all instructions reachable from start_unanchored_.
string Prog::DumpUnanchored() {
  Workq reachable(size_);
  AddToQueue(&reachable, start_unanchored_);
  string listing = ProgToString(this, &reachable);
  return listing;
}
|
||||
|
||||
static bool IsMatch(Prog*, Prog::Inst*);
|
||||
|
||||
// Peep-hole optimizer.
|
||||
void Prog::Optimize() {
|
||||
Workq q(size_);
|
||||
|
||||
// Eliminate nops. Most are taken out during compilation
|
||||
// but a few are hard to avoid.
|
||||
q.clear();
|
||||
AddToQueue(&q, start_);
|
||||
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
|
||||
int id = *i;
|
||||
|
||||
Inst* ip = inst(id);
|
||||
int j = ip->out();
|
||||
Inst* jp;
|
||||
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
|
||||
j = jp->out();
|
||||
}
|
||||
ip->set_out(j);
|
||||
AddToQueue(&q, ip->out());
|
||||
|
||||
if (ip->opcode() == kInstAlt) {
|
||||
j = ip->out1();
|
||||
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
|
||||
j = jp->out();
|
||||
}
|
||||
ip->out1_ = j;
|
||||
AddToQueue(&q, ip->out1());
|
||||
}
|
||||
}
|
||||
|
||||
// Insert kInstAltMatch instructions
|
||||
// Look for
|
||||
// ip: Alt -> j | k
|
||||
// j: ByteRange [00-FF] -> ip
|
||||
// k: Match
|
||||
// or the reverse (the above is the greedy one).
|
||||
// Rewrite Alt to AltMatch.
|
||||
q.clear();
|
||||
AddToQueue(&q, start_);
|
||||
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
|
||||
int id = *i;
|
||||
Inst* ip = inst(id);
|
||||
AddToQueue(&q, ip->out());
|
||||
if (ip->opcode() == kInstAlt)
|
||||
AddToQueue(&q, ip->out1());
|
||||
|
||||
if (ip->opcode() == kInstAlt) {
|
||||
Inst* j = inst(ip->out());
|
||||
Inst* k = inst(ip->out1());
|
||||
if (j->opcode() == kInstByteRange && j->out() == id &&
|
||||
j->lo() == 0x00 && j->hi() == 0xFF &&
|
||||
IsMatch(this, k)) {
|
||||
ip->set_opcode(kInstAltMatch);
|
||||
continue;
|
||||
}
|
||||
if (IsMatch(this, j) &&
|
||||
k->opcode() == kInstByteRange && k->out() == id &&
|
||||
k->lo() == 0x00 && k->hi() == 0xFF) {
|
||||
ip->set_opcode(kInstAltMatch);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Is ip a guaranteed match at end of text, perhaps after some capturing?
|
||||
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
  // Walk through captures and nops; anything else decides the answer.
  while (true) {
    switch (ip->opcode()) {
      case kInstMatch:
        return true;

      case kInstCapture:
      case kInstNop:
        ip = prog->inst(ip->out());
        continue;

      case kInstAlt:
      case kInstAltMatch:
      case kInstByteRange:
      case kInstFail:
      case kInstEmptyWidth:
        return false;

      default:
        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
        return false;
    }
  }
}
|
||||
|
||||
// Returns the set of kEmpty* flags in effect at position p of text.
// Exactly one of kEmptyWordBoundary / kEmptyNonWordBoundary is always set.
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
  const bool at_begin = (p == text.begin());
  const bool at_end = (p == text.end());
  int flags = 0;

  // ^ and \A
  if (at_begin)
    flags |= kEmptyBeginText | kEmptyBeginLine;
  else if (p[-1] == '\n')
    flags |= kEmptyBeginLine;

  // $ and \z
  if (at_end)
    flags |= kEmptyEndText | kEmptyEndLine;
  else if (p < text.end() && p[0] == '\n')
    flags |= kEmptyEndLine;

  // \b and \B
  bool boundary;
  if (at_begin && at_end) {
    boundary = false;  // empty text: no word boundary here
  } else if (at_begin) {
    boundary = IsWordChar(p[0]);
  } else if (at_end) {
    boundary = IsWordChar(p[-1]);
  } else {
    boundary = IsWordChar(p[-1]) != IsWordChar(p[0]);
  }
  if (boundary)
    flags |= kEmptyWordBoundary;
  else
    flags |= kEmptyNonWordBoundary;

  return flags;
}
|
||||
|
||||
// Records in byterange_ that bytes lo..hi (inclusive) are treated
// differently from the bytes around them, by setting the bit of the last
// byte before the range (lo-1, when lo > 0) and of the last byte in it (hi).
// Both lo and hi must lie in [0, 255].
void Prog::MarkByteRange(int lo, int hi) {
  CHECK_GE(lo, 0);
  CHECK_GE(hi, 0);
  CHECK_LE(lo, 255);
  CHECK_LE(hi, 255);
  byterange_.Set(hi);
  if (lo > 0)
    byterange_.Set(lo - 1);
}
|
||||
|
||||
void Prog::ComputeByteMap() {
|
||||
// Fill in bytemap with byte classes for prog_.
|
||||
// Ranges of bytes that are treated as indistinguishable
|
||||
// by the regexp program are mapped to a single byte class.
|
||||
// The vector prog_->byterange() marks the end of each
|
||||
// such range.
|
||||
const Bitmap<256>& v = byterange();
|
||||
|
||||
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
|
||||
uint8 n = 0;
|
||||
uint32 bits = 0;
|
||||
for (int i = 0; i < 256; i++) {
|
||||
if ((i&31) == 0)
|
||||
bits = v.Word(i >> 5);
|
||||
bytemap_[i] = n;
|
||||
n += bits & 1;
|
||||
bits >>= 1;
|
||||
}
|
||||
bytemap_range_ = bytemap_[255] + 1;
|
||||
unbytemap_ = new uint8[bytemap_range_];
|
||||
for (int i = 0; i < 256; i++)
|
||||
unbytemap_[bytemap_[i]] = i;
|
||||
|
||||
if (0) { // For debugging: use trivial byte map.
|
||||
for (int i = 0; i < 256; i++) {
|
||||
bytemap_[i] = i;
|
||||
unbytemap_[i] = i;
|
||||
}
|
||||
bytemap_range_ = 256;
|
||||
LOG(INFO) << "Using trivial bytemap.";
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,376 @@
|
|||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Compiled representation of regular expressions.
|
||||
// See regexp.h for the Regexp class, which represents a regular
|
||||
// expression symbolically.
|
||||
|
||||
#ifndef RE2_PROG_H__
|
||||
#define RE2_PROG_H__
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Simple fixed-size bitmap.
|
||||
template<int Bits>
|
||||
class Bitmap {
|
||||
public:
|
||||
Bitmap() { Reset(); }
|
||||
int Size() { return Bits; }
|
||||
|
||||
void Reset() {
|
||||
for (int i = 0; i < Words; i++)
|
||||
w_[i] = 0;
|
||||
}
|
||||
bool Get(int k) const {
|
||||
return w_[k >> WordLog] & (1<<(k & 31));
|
||||
}
|
||||
void Set(int k) {
|
||||
w_[k >> WordLog] |= 1<<(k & 31);
|
||||
}
|
||||
void Clear(int k) {
|
||||
w_[k >> WordLog] &= ~(1<<(k & 31));
|
||||
}
|
||||
uint32 Word(int i) const {
|
||||
return w_[i];
|
||||
}
|
||||
|
||||
private:
|
||||
static const int WordLog = 5;
|
||||
static const int Words = (Bits+31)/32;
|
||||
uint32 w_[Words];
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
|
||||
};
|
||||
|
||||
|
||||
// Opcodes for Inst
|
||||
enum InstOp {
  // Choose between out_ and out1_.
  kInstAlt = 0,
  // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
  kInstAltMatch = 1,
  // Next (possibly case-folded) byte must be in [lo_, hi_].
  kInstByteRange = 2,
  // Capturing parenthesis number cap_.
  kInstCapture = 3,
  // Empty-width special (^ $ ...); bit(s) set in empty_.
  kInstEmptyWidth = 4,
  // Found a match!
  kInstMatch = 5,
  // No-op; occasionally unavoidable.
  kInstNop = 6,
  // Never match; occasionally unavoidable.
  kInstFail = 7,
};
|
||||
|
||||
// Bit flags for empty-width specials
|
||||
enum EmptyOp {
  kEmptyBeginLine = 0x01,        // ^ - beginning of line
  kEmptyEndLine = 0x02,          // $ - end of line
  kEmptyBeginText = 0x04,        // \A - beginning of text
  kEmptyEndText = 0x08,          // \z - end of text
  kEmptyWordBoundary = 0x10,     // \b - word boundary
  kEmptyNonWordBoundary = 0x20,  // \B - not \b
  kEmptyAllFlags = 0x3F,         // all six flags above
};
|
||||
|
||||
class Regexp;
|
||||
|
||||
class DFA;
|
||||
struct OneState;
|
||||
|
||||
// Compiled form of regexp program.
|
||||
class Prog {
|
||||
public:
|
||||
Prog();
|
||||
~Prog();
|
||||
|
||||
// Single instruction in regexp program.
|
||||
class Inst {
|
||||
public:
|
||||
Inst() : out_opcode_(0), out1_(0) { }
|
||||
|
||||
// Constructors per opcode
|
||||
void InitAlt(uint32 out, uint32 out1);
|
||||
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
|
||||
void InitCapture(int cap, uint32 out);
|
||||
void InitEmptyWidth(EmptyOp empty, uint32 out);
|
||||
void InitMatch(int id);
|
||||
void InitNop(uint32 out);
|
||||
void InitFail();
|
||||
|
||||
// Getters
|
||||
int id(Prog* p) { return this - p->inst_; }
|
||||
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
|
||||
int out() { return out_opcode_>>3; }
|
||||
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
||||
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
|
||||
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
|
||||
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
|
||||
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
|
||||
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
|
||||
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
|
||||
bool greedy(Prog *p) {
|
||||
DCHECK_EQ(opcode(), kInstAltMatch);
|
||||
return p->inst(out())->opcode() == kInstByteRange;
|
||||
}
|
||||
|
||||
// Does this inst (an kInstByteRange) match c?
|
||||
inline bool Matches(int c) {
|
||||
DCHECK_EQ(opcode(), kInstByteRange);
|
||||
if (foldcase_ && 'A' <= c && c <= 'Z')
|
||||
c += 'a' - 'A';
|
||||
return lo_ <= c && c <= hi_;
|
||||
}
|
||||
|
||||
// Returns string representation for debugging.
|
||||
string Dump();
|
||||
|
||||
// Maximum instruction id.
|
||||
// (Must fit in out_opcode_, and PatchList steals another bit.)
|
||||
static const int kMaxInst = (1<<28) - 1;
|
||||
|
||||
private:
|
||||
void set_opcode(InstOp opcode) {
|
||||
out_opcode_ = (out()<<3) | opcode;
|
||||
}
|
||||
|
||||
void set_out(int out) {
|
||||
out_opcode_ = (out<<3) | opcode();
|
||||
}
|
||||
|
||||
void set_out_opcode(int out, InstOp opcode) {
|
||||
out_opcode_ = (out<<3) | opcode;
|
||||
}
|
||||
|
||||
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
|
||||
union { // additional instruction arguments:
|
||||
uint32 out1_; // opcode == kInstAlt
|
||||
// alternate next instruction
|
||||
|
||||
int32 cap_; // opcode == kInstCapture
|
||||
// Index of capture register (holds text
|
||||
// position recorded by capturing parentheses).
|
||||
// For \n (the submatch for the nth parentheses),
|
||||
// the left parenthesis captures into register 2*n
|
||||
// and the right one captures into register 2*n+1.
|
||||
|
||||
int32 match_id_; // opcode == kInstMatch
|
||||
// Match ID to identify this match (for re2::Set).
|
||||
|
||||
struct { // opcode == kInstByteRange
|
||||
uint8 lo_; // byte range is lo_-hi_ inclusive
|
||||
uint8 hi_; //
|
||||
uint8 foldcase_; // convert A-Z to a-z before checking range.
|
||||
};
|
||||
|
||||
EmptyOp empty_; // opcode == kInstEmptyWidth
|
||||
// empty_ is bitwise OR of kEmpty* flags above.
|
||||
};
|
||||
|
||||
friend class Compiler;
|
||||
friend struct PatchList;
|
||||
friend class Prog;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Inst);
|
||||
};
|
||||
|
||||
// Whether to anchor the search.
|
||||
enum Anchor {
|
||||
kUnanchored, // match anywhere
|
||||
kAnchored, // match only starting at beginning of text
|
||||
};
|
||||
|
||||
// Kind of match to look for (for anchor != kFullMatch)
|
||||
//
|
||||
// kLongestMatch mode finds the overall longest
|
||||
// match but still makes its submatch choices the way
|
||||
// Perl would, not in the way prescribed by POSIX.
|
||||
// The POSIX rules are much more expensive to implement,
|
||||
// and no one has needed them.
|
||||
//
|
||||
// kFullMatch is not strictly necessary -- we could use
|
||||
// kLongestMatch and then check the length of the match -- but
|
||||
// the matching code can run faster if it knows to consider only
|
||||
// full matches.
|
||||
enum MatchKind {
|
||||
kFirstMatch, // like Perl, PCRE
|
||||
kLongestMatch, // like egrep or POSIX
|
||||
kFullMatch, // match only entire text; implies anchor==kAnchored
|
||||
kManyMatch // for SearchDFA, records set of matches
|
||||
};
|
||||
|
||||
Inst *inst(int id) { return &inst_[id]; }
|
||||
int start() { return start_; }
|
||||
int start_unanchored() { return start_unanchored_; }
|
||||
void set_start(int start) { start_ = start; }
|
||||
void set_start_unanchored(int start) { start_unanchored_ = start; }
|
||||
int64 size() { return size_; }
|
||||
bool reversed() { return reversed_; }
|
||||
void set_reversed(bool reversed) { reversed_ = reversed; }
|
||||
int64 byte_inst_count() { return byte_inst_count_; }
|
||||
const Bitmap<256>& byterange() { return byterange_; }
|
||||
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
|
||||
int64 dfa_mem() { return dfa_mem_; }
|
||||
int flags() { return flags_; }
|
||||
void set_flags(int flags) { flags_ = flags; }
|
||||
bool anchor_start() { return anchor_start_; }
|
||||
void set_anchor_start(bool b) { anchor_start_ = b; }
|
||||
bool anchor_end() { return anchor_end_; }
|
||||
void set_anchor_end(bool b) { anchor_end_ = b; }
|
||||
int bytemap_range() { return bytemap_range_; }
|
||||
const uint8* bytemap() { return bytemap_; }
|
||||
|
||||
// Returns string representation of program for debugging.
|
||||
string Dump();
|
||||
string DumpUnanchored();
|
||||
|
||||
// Record that at some point in the prog, the bytes in the range
|
||||
// lo-hi (inclusive) are treated as different from bytes outside the range.
|
||||
// Tracking this lets the DFA collapse commonly-treated byte ranges
|
||||
// when recording state pointers, greatly reducing its memory footprint.
|
||||
void MarkByteRange(int lo, int hi);
|
||||
|
||||
// Returns the set of kEmpty flags that are in effect at
|
||||
// position p within context.
|
||||
static uint32 EmptyFlags(const StringPiece& context, const char* p);
|
||||
|
||||
// Returns whether byte c is a word character: ASCII only.
|
||||
// Used by the implementation of \b and \B.
|
||||
// This is not right for Unicode, but:
|
||||
// - it's hard to get right in a byte-at-a-time matching world
|
||||
// (the DFA has only one-byte lookahead).
|
||||
// - even if the lookahead were possible, the Progs would be huge.
|
||||
// This crude approximation is the same one PCRE uses.
|
||||
static bool IsWordChar(uint8 c) {
|
||||
return ('A' <= c && c <= 'Z') ||
|
||||
('a' <= c && c <= 'z') ||
|
||||
('0' <= c && c <= '9') ||
|
||||
c == '_';
|
||||
}
|
||||
|
||||
// Execution engines. They all search for the regexp (run the prog)
|
||||
// in text, which is in the larger context (used for ^ $ \b etc).
|
||||
// Anchor and kind control the kind of search.
|
||||
// Returns true if match found, false if not.
|
||||
// If match found, fills match[0..nmatch-1] with submatch info.
|
||||
// match[0] is overall match, match[1] is first set of parens, etc.
|
||||
// If a particular submatch is not matched during the regexp match,
|
||||
// it is set to NULL.
|
||||
//
|
||||
// Matching text == StringPiece(NULL, 0) is treated as any other empty
|
||||
// string, but note that on return, it will not be possible to distinguish
|
||||
// submatches that matched that empty string from submatches that didn't
|
||||
// match anything. Either way, match[i] == NULL.
|
||||
|
||||
// Search using NFA: can find submatches but kind of slow.
|
||||
bool SearchNFA(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch);
|
||||
|
||||
// Search using DFA: much faster than NFA but only finds
|
||||
// end of match and can use a lot more memory.
|
||||
// Returns whether a match was found.
|
||||
// If the DFA runs out of memory, sets *failed to true and returns false.
|
||||
// If matches != NULL and kind == kManyMatch and there is a match,
|
||||
// SearchDFA fills matches with the match IDs of the final matching state.
|
||||
bool SearchDFA(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match0, bool* failed,
|
||||
vector<int>* matches);
|
||||
|
||||
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
|
||||
// Usually the DFA is built out incrementally, as needed, which
|
||||
// avoids lots of unnecessary work. This function is useful only
|
||||
// for testing purposes. Returns number of states.
|
||||
int BuildEntireDFA(MatchKind kind);
|
||||
|
||||
// Compute byte map.
|
||||
void ComputeByteMap();
|
||||
|
||||
// Run peep-hole optimizer on program.
|
||||
void Optimize();
|
||||
|
||||
// One-pass NFA: only correct if IsOnePass() is true,
|
||||
// but much faster than NFA (competitive with PCRE)
|
||||
// for those expressions.
|
||||
bool IsOnePass();
|
||||
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch);
|
||||
|
||||
// Bit-state backtracking. Fast on small cases but uses memory
|
||||
// proportional to the product of the program size and the text size.
|
||||
bool SearchBitState(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch);
|
||||
|
||||
static const int kMaxOnePassCapture = 5; // $0 through $4
|
||||
|
||||
// Backtracking search: the gold standard against which the other
|
||||
// implementations are checked. FOR TESTING ONLY.
|
||||
// It allocates a ton of memory to avoid running forever.
|
||||
// It is also recursive, so can't use in production (will overflow stacks).
|
||||
// The name "Unsafe" here is supposed to be a flag that
|
||||
// you should not be using this function.
|
||||
bool UnsafeSearchBacktrack(const StringPiece& text,
|
||||
const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch);
|
||||
|
||||
// Computes range for any strings matching regexp. The min and max can in
|
||||
// some cases be arbitrarily precise, so the caller gets to specify the
|
||||
// maximum desired length of string returned.
|
||||
//
|
||||
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
||||
// string s that is an anchored match for this regexp satisfies
|
||||
// min <= s && s <= max.
|
||||
//
|
||||
// Note that PossibleMatchRange() will only consider the first copy of an
|
||||
// infinitely repeated element (i.e., any regexp element followed by a '*' or
|
||||
// '+' operator). Regexps with "{N}" constructions are not affected, as those
|
||||
// do not compile down to infinite repetitions.
|
||||
//
|
||||
// Returns true on success, false on error.
|
||||
bool PossibleMatchRange(string* min, string* max, int maxlen);
|
||||
|
||||
// Compiles a collection of regexps to Prog. Each regexp will have
|
||||
// its own Match instruction recording the index in the vector.
|
||||
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
||||
Regexp* re);
|
||||
|
||||
private:
|
||||
friend class Compiler;
|
||||
|
||||
DFA* GetDFA(MatchKind kind);
|
||||
|
||||
bool anchor_start_; // regexp has explicit start anchor
|
||||
bool anchor_end_; // regexp has explicit end anchor
|
||||
bool reversed_; // whether program runs backward over input
|
||||
bool did_onepass_; // has IsOnePass been called?
|
||||
|
||||
int start_; // entry point for program
|
||||
int start_unanchored_; // unanchored entry point for program
|
||||
int size_; // number of instructions
|
||||
int byte_inst_count_; // number of kInstByteRange instructions
|
||||
int bytemap_range_; // bytemap_[x] < bytemap_range_
|
||||
int flags_; // regexp parse flags
|
||||
int onepass_statesize_; // byte size of each OneState* node
|
||||
|
||||
Inst* inst_; // pointer to instruction array
|
||||
|
||||
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
|
||||
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
|
||||
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
|
||||
int64 dfa_mem_; // Maximum memory for DFAs.
|
||||
void (*delete_dfa_)(DFA* dfa);
|
||||
|
||||
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
|
||||
// commonly-treated byte range.
|
||||
uint8 bytemap_[256]; // map from input bytes to byte classes
|
||||
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
|
||||
|
||||
uint8* onepass_nodes_; // data for OnePass nodes
|
||||
OneState* onepass_start_; // start node for OnePass program
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Prog);
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_PROG_H__
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,837 @@
|
|||
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_RE2_H
|
||||
#define RE2_RE2_H
|
||||
|
||||
// C++ interface to the re2 regular-expression library.
|
||||
// RE2 supports Perl-style regular expressions (with extensions like
|
||||
// \d, \w, \s, ...).
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// REGEXP SYNTAX:
|
||||
//
|
||||
// This module uses the re2 library and hence supports
|
||||
// its syntax for regular expressions, which is similar to Perl's with
|
||||
// some of the more complicated things thrown away. In particular,
|
||||
// backreferences and generalized assertions are not available, nor is \Z.
|
||||
//
|
||||
// See https://github.com/google/re2/wiki/Syntax for the syntax
|
||||
// supported by RE2, and a comparison with PCRE and PERL regexps.
|
||||
//
|
||||
// For those not familiar with Perl's regular expressions,
|
||||
// here are some examples of the most commonly used extensions:
|
||||
//
|
||||
// "hello (\\w+) world" -- \w matches a "word" character
|
||||
// "version (\\d+)" -- \d matches a digit
|
||||
// "hello\\s+world" -- \s matches any whitespace character
|
||||
// "\\b(\\w+)\\b" -- \b matches the empty string at a word boundary
|
||||
// "(?i)hello" -- (?i) turns on case-insensitive matching
|
||||
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// MATCHING INTERFACE:
|
||||
//
|
||||
// The "FullMatch" operation checks that supplied text matches a
|
||||
// supplied pattern exactly.
|
||||
//
|
||||
// Example: successful match
|
||||
// CHECK(RE2::FullMatch("hello", "h.*o"));
|
||||
//
|
||||
// Example: unsuccessful match (requires full match):
|
||||
// CHECK(!RE2::FullMatch("hello", "e"));
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// UTF-8 AND THE MATCHING INTERFACE:
|
||||
//
|
||||
// By default, the pattern and input text are interpreted as UTF-8.
|
||||
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
|
||||
//
|
||||
// Example:
|
||||
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
|
||||
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// MATCHING WITH SUB-STRING EXTRACTION:
|
||||
//
|
||||
// You can supply extra pointer arguments to extract matched subpieces.
|
||||
//
|
||||
// Example: extracts "ruby" into "s" and 1234 into "i"
|
||||
// int i;
|
||||
// string s;
|
||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
|
||||
//
|
||||
// Example: fails because string cannot be stored in integer
|
||||
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
|
||||
//
|
||||
// Example: fails because there aren't enough sub-patterns:
|
||||
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
|
||||
//
|
||||
// Example: does not try to extract any extra sub-patterns
|
||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
|
||||
//
|
||||
// Example: does not try to extract into NULL
|
||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
|
||||
//
|
||||
// Example: integer overflow causes failure
|
||||
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
|
||||
//
|
||||
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
|
||||
// This may get a little faster in the future, but right now is slower
|
||||
// than PCRE. On the other hand, failed matches run *very* fast (faster
|
||||
// than PCRE), as do matches without substring extraction.
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// PARTIAL MATCHES
|
||||
//
|
||||
// You can use the "PartialMatch" operation when you want the pattern
|
||||
// to match any substring of the text.
|
||||
//
|
||||
// Example: simple search for a string:
|
||||
// CHECK(RE2::PartialMatch("hello", "ell"));
|
||||
//
|
||||
// Example: find first number in a string
|
||||
// int number;
|
||||
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
|
||||
// CHECK_EQ(number, 100);
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// PRE-COMPILED REGULAR EXPRESSIONS
|
||||
//
|
||||
// RE2 makes it easy to use any string as a regular expression, without
|
||||
// requiring a separate compilation step.
|
||||
//
|
||||
// If speed is of the essence, you can create a pre-compiled "RE2"
|
||||
// object from the pattern and use it multiple times. If you do so,
|
||||
// you can typically parse text faster than with sscanf.
|
||||
//
|
||||
// Example: precompile pattern for faster matching:
|
||||
// RE2 pattern("h.*o");
|
||||
// while (ReadLine(&str)) {
|
||||
// if (RE2::FullMatch(str, pattern)) ...;
|
||||
// }
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// SCANNING TEXT INCREMENTALLY
|
||||
//
|
||||
// The "Consume" operation may be useful if you want to repeatedly
|
||||
// match regular expressions at the front of a string and skip over
|
||||
// them as they match. This requires use of the "StringPiece" type,
|
||||
// which represents a sub-range of a real string.
|
||||
//
|
||||
// Example: read lines of the form "var = value" from a string.
|
||||
// string contents = ...; // Fill string somehow
|
||||
// StringPiece input(contents); // Wrap a StringPiece around it
|
||||
//
|
||||
// string var;
|
||||
// int value;
|
||||
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
|
||||
// ...;
|
||||
// }
|
||||
//
|
||||
// Each successful call to "Consume" will set "var/value", and also
|
||||
// advance "input" so it points past the matched text. Note that if the
|
||||
// regular expression matches an empty string, input will advance
|
||||
// by 0 bytes. If the regular expression being used might match
|
||||
// an empty string, the loop body must check for this case and either
|
||||
// advance the string or break out of the loop.
|
||||
//
|
||||
// The "FindAndConsume" operation is similar to "Consume" but does not
|
||||
// anchor your match at the beginning of the string. For example, you
|
||||
// could extract all words from a string by repeatedly calling
|
||||
// RE2::FindAndConsume(&input, "(\\w+)", &word)
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// USING VARIABLE NUMBER OF ARGUMENTS
|
||||
//
|
||||
// The above operations require you to know the number of arguments
|
||||
// when you write the code. This is not always possible or easy (for
|
||||
// example, the regular expression may be calculated at run time).
|
||||
// You can use the "N" version of the operations when the number of
|
||||
// match arguments is determined at run time.
|
||||
//
|
||||
// Example:
|
||||
// const RE2::Arg* args[10];
|
||||
// int n;
|
||||
// // ... populate args with pointers to RE2::Arg values ...
|
||||
// // ... set n to the number of RE2::Arg objects ...
|
||||
// bool match = RE2::FullMatchN(input, pattern, args, n);
|
||||
//
|
||||
// The last statement is equivalent to
|
||||
//
|
||||
// bool match = RE2::FullMatch(input, pattern,
|
||||
// *args[0], *args[1], ..., *args[n - 1]);
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// PARSING HEX/OCTAL/C-RADIX NUMBERS
|
||||
//
|
||||
// By default, if you pass a pointer to a numeric value, the
|
||||
// corresponding text is interpreted as a base-10 number. You can
|
||||
// instead wrap the pointer with a call to one of the operators Hex(),
|
||||
// Octal(), or CRadix() to interpret the text in another base. The
|
||||
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
|
||||
// prefixes, but defaults to base-10.
|
||||
//
|
||||
// Example:
|
||||
// int a, b, c, d;
|
||||
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
|
||||
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
|
||||
// will leave 64 in a, b, c, and d.
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/variadic_function.h"
|
||||
|
||||
namespace re2 {
|
||||
using std::string;
|
||||
using std::map;
|
||||
class Mutex;
|
||||
class Prog;
|
||||
class Regexp;
|
||||
|
||||
// Interface for regular expression matching. Also corresponds to a
|
||||
// pre-compiled regular expression. An "RE2" object is safe for
|
||||
// concurrent use by multiple threads.
|
||||
class RE2 {
|
||||
public:
|
||||
// We convert user-passed pointers into special Arg objects
|
||||
class Arg;
|
||||
class Options;
|
||||
|
||||
// Defined in set.h.
|
||||
class Set;
|
||||
|
||||
enum ErrorCode {
|
||||
NoError = 0,
|
||||
|
||||
// Unexpected error
|
||||
ErrorInternal,
|
||||
|
||||
// Parse errors
|
||||
ErrorBadEscape, // bad escape sequence
|
||||
ErrorBadCharClass, // bad character class
|
||||
ErrorBadCharRange, // bad character class range
|
||||
ErrorMissingBracket, // missing closing ]
|
||||
ErrorMissingParen, // missing closing )
|
||||
ErrorTrailingBackslash, // trailing \ at end of regexp
|
||||
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
|
||||
ErrorRepeatSize, // bad repetition argument
|
||||
ErrorRepeatOp, // bad repetition operator
|
||||
ErrorBadPerlOp, // bad perl operator
|
||||
ErrorBadUTF8, // invalid UTF-8 in regexp
|
||||
ErrorBadNamedCapture, // bad named capture group
|
||||
ErrorPatternTooLarge, // pattern too large (compile failed)
|
||||
};
|
||||
|
||||
// Predefined common options.
|
||||
// If you need more complicated things, instantiate
|
||||
// an Options object, change the settings, and pass it to the
|
||||
// RE2 constructor.
|
||||
static const Options DefaultOptions;
|
||||
static const Options Latin1; // treat input as Latin-1 (default UTF-8)
|
||||
static const Options POSIX; // POSIX syntax, leftmost-longest match
|
||||
static const Options Quiet; // do not log about regexp parse errors
|
||||
|
||||
// Need to have the const char* and const string& forms for implicit
|
||||
// conversions when passing string literals to FullMatch and PartialMatch.
|
||||
// Otherwise the StringPiece form would be sufficient.
|
||||
#ifndef SWIG
|
||||
RE2(const char* pattern);
|
||||
RE2(const string& pattern);
|
||||
#endif
|
||||
RE2(const StringPiece& pattern);
|
||||
RE2(const StringPiece& pattern, const Options& option);
|
||||
~RE2();
|
||||
|
||||
// Returns whether RE2 was created properly.
|
||||
bool ok() const { return error_code() == NoError; }
|
||||
|
||||
// The string specification for this RE2. E.g.
|
||||
// RE2 re("ab*c?d+");
|
||||
// re.pattern(); // "ab*c?d+"
|
||||
const string& pattern() const { return pattern_; }
|
||||
|
||||
// If RE2 could not be created properly, returns an error string.
|
||||
// Else returns the empty string.
|
||||
const string& error() const { return *error_; }
|
||||
|
||||
// If RE2 could not be created properly, returns an error code.
|
||||
// Else returns RE2::NoError (== 0).
|
||||
ErrorCode error_code() const { return error_code_; }
|
||||
|
||||
// If RE2 could not be created properly, returns the offending
|
||||
// portion of the regexp.
|
||||
const string& error_arg() const { return error_arg_; }
|
||||
|
||||
// Returns the program size, a very approximate measure of a regexp's "cost".
|
||||
// Larger numbers are more expensive than smaller numbers.
|
||||
int ProgramSize() const;
|
||||
|
||||
// Returns the underlying Regexp; not for general use.
|
||||
// Returns entire_regexp_ so that callers don't need
|
||||
// to know about prefix_ and prefix_foldcase_.
|
||||
re2::Regexp* Regexp() const { return entire_regexp_; }
|
||||
|
||||
/***** The useful part: the matching interface *****/
|
||||
|
||||
// Matches "text" against "pattern". If pointer arguments are
|
||||
// supplied, copies matched sub-patterns into them.
|
||||
//
|
||||
// You can pass in a "const char*" or a "string" for "text".
|
||||
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
|
||||
//
|
||||
// The provided pointer arguments can be pointers to any scalar numeric
|
||||
// type, or one of:
|
||||
// string (matched piece is copied to string)
|
||||
// StringPiece (StringPiece is mutated to point to matched piece)
|
||||
// T (where "bool T::ParseFrom(const char*, int)" exists)
|
||||
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
||||
//
|
||||
// Returns true iff all of the following conditions are satisfied:
|
||||
// a. "text" matches "pattern" exactly
|
||||
// b. The number of matched sub-patterns is >= number of supplied pointers
|
||||
// c. The "i"th argument has a suitable type for holding the
|
||||
// string captured as the "i"th sub-pattern. If you pass in
|
||||
// NULL for the "i"th argument, or pass fewer arguments than
|
||||
// number of sub-patterns, "i"th captured sub-pattern is
|
||||
// ignored.
|
||||
//
|
||||
// CAVEAT: An optional sub-pattern that does not exist in the
|
||||
// matched string is assigned the empty string. Therefore, the
|
||||
// following will return false (because the empty string is not a
|
||||
// valid number):
|
||||
// int number;
|
||||
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
||||
static bool FullMatchN(const StringPiece& text, const RE2& re,
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
|
||||
|
||||
// Exactly like FullMatch(), except that "pattern" is allowed to match
|
||||
// a substring of "text".
|
||||
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
|
||||
|
||||
// Like FullMatch() and PartialMatch(), except that pattern has to
|
||||
// match a prefix of "text", and "input" is advanced past the matched
|
||||
// text. Note: "input" is modified iff this routine returns true.
|
||||
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
|
||||
|
||||
// Like Consume(..), but does not anchor the match at the beginning of the
|
||||
// string. That is, "pattern" need not start its match at the beginning of
|
||||
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
|
||||
// word in "s" and stores it in "word".
|
||||
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
|
||||
|
||||
// Replace the first match of "pattern" in "str" with "rewrite".
|
||||
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
|
||||
// used to insert text matching corresponding parenthesized group
|
||||
// from the pattern. \0 in "rewrite" refers to the entire matching
|
||||
// text. E.g.,
|
||||
//
|
||||
// string s = "yabba dabba doo";
|
||||
// CHECK(RE2::Replace(&s, "b+", "d"));
|
||||
//
|
||||
// will leave "s" containing "yada dabba doo"
|
||||
//
|
||||
// Returns true if the pattern matches and a replacement occurs,
|
||||
// false otherwise.
|
||||
static bool Replace(string *str,
|
||||
const RE2& pattern,
|
||||
const StringPiece& rewrite);
|
||||
|
||||
// Like Replace(), except replaces successive non-overlapping occurrences
|
||||
// of the pattern in the string with the rewrite. E.g.
|
||||
//
|
||||
// string s = "yabba dabba doo";
|
||||
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
|
||||
//
|
||||
// will leave "s" containing "yada dada doo"
|
||||
// Replacements are not subject to re-matching.
|
||||
//
|
||||
// Because GlobalReplace only replaces non-overlapping matches,
|
||||
// replacing "ana" within "banana" makes only one replacement, not two.
|
||||
//
|
||||
// Returns the number of replacements made.
|
||||
static int GlobalReplace(string *str,
|
||||
const RE2& pattern,
|
||||
const StringPiece& rewrite);
|
||||
|
||||
// Like Replace, except that if the pattern matches, "rewrite"
|
||||
// is copied into "out" with substitutions. The non-matching
|
||||
// portions of "text" are ignored.
|
||||
//
|
||||
// Returns true iff a match occurred and the extraction happened
|
||||
// successfully; if no match occurs, the string is left unaffected.
|
||||
static bool Extract(const StringPiece &text,
|
||||
const RE2& pattern,
|
||||
const StringPiece &rewrite,
|
||||
string *out);
|
||||
|
||||
// Escapes all potentially meaningful regexp characters in
|
||||
// 'unquoted'. The returned string, used as a regular expression,
|
||||
// will exactly match the original string. For example,
|
||||
// 1.5-2.0?
|
||||
// may become:
|
||||
// 1\.5\-2\.0\?
|
||||
static string QuoteMeta(const StringPiece& unquoted);
|
||||
|
||||
// Computes range for any strings matching regexp. The min and max can in
|
||||
// some cases be arbitrarily precise, so the caller gets to specify the
|
||||
// maximum desired length of string returned.
|
||||
//
|
||||
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
||||
// string s that is an anchored match for this regexp satisfies
|
||||
// min <= s && s <= max.
|
||||
//
|
||||
// Note that PossibleMatchRange() will only consider the first copy of an
|
||||
// infinitely repeated element (i.e., any regexp element followed by a '*' or
|
||||
// '+' operator). Regexps with "{N}" constructions are not affected, as those
|
||||
// do not compile down to infinite repetitions.
|
||||
//
|
||||
// Returns true on success, false on error.
|
||||
bool PossibleMatchRange(string* min, string* max, int maxlen) const;
|
||||
|
||||
// Generic matching interface
|
||||
|
||||
// Type of match.
|
||||
enum Anchor {
|
||||
UNANCHORED, // No anchoring
|
||||
ANCHOR_START, // Anchor at start only
|
||||
ANCHOR_BOTH, // Anchor at start and end
|
||||
};
|
||||
|
||||
// Return the number of capturing subpatterns, or -1 if the
|
||||
// regexp wasn't valid on construction. The overall match ($0)
|
||||
// does not count: if the regexp is "(a)(b)", returns 2.
|
||||
int NumberOfCapturingGroups() const;
|
||||
|
||||
|
||||
// Return a map from names to capturing indices.
|
||||
// The map records the index of the leftmost group
|
||||
// with the given name.
|
||||
// Only valid until the re is deleted.
|
||||
const map<string, int>& NamedCapturingGroups() const;
|
||||
|
||||
// Return a map from capturing indices to names.
|
||||
// The map has no entries for unnamed groups.
|
||||
// Only valid until the re is deleted.
|
||||
const map<int, string>& CapturingGroupNames() const;
|
||||
|
||||
// General matching routine.
|
||||
// Match against text starting at offset startpos
|
||||
// and stopping the search at offset endpos.
|
||||
// Returns true if match found, false if not.
|
||||
// On a successful match, fills in match[] (up to nmatch entries)
|
||||
// with information about submatches.
|
||||
// E.g. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
|
||||
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
|
||||
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
|
||||
//
|
||||
// Don't ask for more match information than you will use:
|
||||
// runs much faster with nmatch == 1 than nmatch > 1, and
|
||||
// runs even faster if nmatch == 0.
|
||||
// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
|
||||
// but will be handled correctly.
|
||||
//
|
||||
// Passing text == StringPiece(NULL, 0) will be handled like any other
|
||||
// empty string, but note that on return, it will not be possible to tell
|
||||
// whether submatch i matched the empty string or did not match:
|
||||
// either way, match[i] == NULL.
|
||||
bool Match(const StringPiece& text,
|
||||
int startpos,
|
||||
int endpos,
|
||||
Anchor anchor,
|
||||
StringPiece *match,
|
||||
int nmatch) const;
|
||||
|
||||
// Check that the given rewrite string is suitable for use with this
|
||||
// regular expression. It checks that:
|
||||
// * The regular expression has enough parenthesized subexpressions
|
||||
// to satisfy all of the \N tokens in rewrite
|
||||
// * The rewrite string doesn't have any syntax errors. E.g.,
|
||||
// '\' followed by anything other than a digit or '\'.
|
||||
// A true return value guarantees that Replace() and Extract() won't
|
||||
// fail because of a bad rewrite string.
|
||||
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
|
||||
|
||||
// Constructor options
|
||||
class Options {
|
||||
public:
|
||||
// The options are (defaults in parentheses):
|
||||
//
|
||||
// utf8 (true) text and pattern are UTF-8; otherwise Latin-1
|
||||
// posix_syntax (false) restrict regexps to POSIX egrep syntax
|
||||
// longest_match (false) search for longest match, not first match
|
||||
// log_errors (true) log syntax and execution errors to ERROR
|
||||
// max_mem (see below) approx. max memory footprint of RE2
|
||||
// literal (false) interpret string as literal, not regexp
|
||||
// never_nl (false) never match \n, even if it is in regexp
|
||||
// case_sensitive (true) match is case-sensitive (regexp can override
|
||||
// with (?i) unless in posix_syntax mode)
|
||||
//
|
||||
// The following options are only consulted when posix_syntax == true.
|
||||
// (When posix_syntax == false these features are always enabled and
|
||||
// cannot be turned off.)
|
||||
// perl_classes (false) allow Perl's \d \s \w \D \S \W
|
||||
// word_boundary (false) allow Perl's \b \B (word boundary and not)
|
||||
// one_line (false) ^ and $ only match beginning and end of text
|
||||
//
|
||||
// The max_mem option controls how much memory can be used
|
||||
// to hold the compiled form of the regexp (the Prog) and
|
||||
// its cached DFA graphs. Code Search placed limits on the number
|
||||
// of Prog instructions and DFA states: 10,000 for both.
|
||||
// In RE2, those limits would translate to about 240 KB per Prog
|
||||
// and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
|
||||
// better job of keeping them small than Code Search did).
|
||||
// Each RE2 has two Progs (one forward, one reverse), and each Prog
|
||||
// can have two DFAs (one first match, one longest match).
|
||||
// That makes 4 DFAs:
|
||||
//
|
||||
// forward, first-match - used for UNANCHORED or ANCHOR_START searches
|
||||
// if opt.longest_match() == false
|
||||
// forward, longest-match - used for all ANCHOR_BOTH searches,
|
||||
// and the other two kinds if
|
||||
// opt.longest_match() == true
|
||||
// reverse, first-match - never used
|
||||
// reverse, longest-match - used as second phase for unanchored searches
|
||||
//
|
||||
// The RE2 memory budget is statically divided between the two
|
||||
// Progs and then the DFAs: two thirds to the forward Prog
|
||||
// and one third to the reverse Prog. The forward Prog gives half
|
||||
// of what it has left over to each of its DFAs. The reverse Prog
|
||||
// gives it all to its longest-match DFA.
|
||||
//
|
||||
// Once a DFA fills its budget, it flushes its cache and starts over.
|
||||
// If this happens too often, RE2 falls back on the NFA implementation.
|
||||
|
||||
// For now, make the default budget something close to Code Search.
|
||||
static const int kDefaultMaxMem = 8<<20;
|
||||
|
||||
enum Encoding {
|
||||
EncodingUTF8 = 1,
|
||||
EncodingLatin1
|
||||
};
|
||||
|
||||
Options() :
|
||||
encoding_(EncodingUTF8),
|
||||
posix_syntax_(false),
|
||||
longest_match_(false),
|
||||
log_errors_(true),
|
||||
max_mem_(kDefaultMaxMem),
|
||||
literal_(false),
|
||||
never_nl_(false),
|
||||
case_sensitive_(true),
|
||||
perl_classes_(false),
|
||||
word_boundary_(false),
|
||||
one_line_(false) {
|
||||
}
|
||||
|
||||
Encoding encoding() const { return encoding_; }
|
||||
void set_encoding(Encoding encoding) { encoding_ = encoding; }
|
||||
|
||||
// Legacy interface to encoding.
|
||||
// TODO(rsc): Remove once clients have been converted.
|
||||
bool utf8() const { return encoding_ == EncodingUTF8; }
|
||||
void set_utf8(bool b) {
|
||||
if (b) {
|
||||
encoding_ = EncodingUTF8;
|
||||
} else {
|
||||
encoding_ = EncodingLatin1;
|
||||
}
|
||||
}
|
||||
|
||||
bool posix_syntax() const { return posix_syntax_; }
|
||||
void set_posix_syntax(bool b) { posix_syntax_ = b; }
|
||||
|
||||
bool longest_match() const { return longest_match_; }
|
||||
void set_longest_match(bool b) { longest_match_ = b; }
|
||||
|
||||
bool log_errors() const { return log_errors_; }
|
||||
void set_log_errors(bool b) { log_errors_ = b; }
|
||||
|
||||
int max_mem() const { return max_mem_; }
|
||||
void set_max_mem(int m) { max_mem_ = m; }
|
||||
|
||||
bool literal() const { return literal_; }
|
||||
void set_literal(bool b) { literal_ = b; }
|
||||
|
||||
bool never_nl() const { return never_nl_; }
|
||||
void set_never_nl(bool b) { never_nl_ = b; }
|
||||
|
||||
bool case_sensitive() const { return case_sensitive_; }
|
||||
void set_case_sensitive(bool b) { case_sensitive_ = b; }
|
||||
|
||||
bool perl_classes() const { return perl_classes_; }
|
||||
void set_perl_classes(bool b) { perl_classes_ = b; }
|
||||
|
||||
bool word_boundary() const { return word_boundary_; }
|
||||
void set_word_boundary(bool b) { word_boundary_ = b; }
|
||||
|
||||
bool one_line() const { return one_line_; }
|
||||
void set_one_line(bool b) { one_line_ = b; }
|
||||
|
||||
// Overwrites every option in this object with the value held by src.
// (Plain assignment is not available: operator= is declared private.)
void Copy(const Options& src) {
  // Non-boolean settings.
  encoding_ = src.encoding_;
  max_mem_ = src.max_mem_;

  // Boolean flags.
  posix_syntax_ = src.posix_syntax_;
  longest_match_ = src.longest_match_;
  log_errors_ = src.log_errors_;
  literal_ = src.literal_;
  never_nl_ = src.never_nl_;
  case_sensitive_ = src.case_sensitive_;
  perl_classes_ = src.perl_classes_;
  word_boundary_ = src.word_boundary_;
  one_line_ = src.one_line_;
}
|
||||
|
||||
int ParseFlags() const;
|
||||
|
||||
private:
|
||||
// Private constructor for defining constants like RE2::Latin1.
|
||||
friend class RE2;
|
||||
// Sets the four given options explicitly; everything else gets the same
// values the default constructor uses.
Options(Encoding encoding, bool posix_syntax, bool longest_match,
        bool log_errors)
  : encoding_(encoding), posix_syntax_(posix_syntax),
    longest_match_(longest_match), log_errors_(log_errors),
    max_mem_(kDefaultMaxMem), literal_(false), never_nl_(false),
    case_sensitive_(true), perl_classes_(false), word_boundary_(false),
    one_line_(false) {}
|
||||
|
||||
Encoding encoding_;
|
||||
bool posix_syntax_;
|
||||
bool longest_match_;
|
||||
bool log_errors_;
|
||||
int64_t max_mem_;
|
||||
bool literal_;
|
||||
bool never_nl_;
|
||||
bool case_sensitive_;
|
||||
bool perl_classes_;
|
||||
bool word_boundary_;
|
||||
bool one_line_;
|
||||
|
||||
//DISALLOW_EVIL_CONSTRUCTORS(Options);
|
||||
Options(const Options&);
|
||||
void operator=(const Options&);
|
||||
};
|
||||
|
||||
// Returns the options set in the constructor.
|
||||
const Options& options() const {
  // Hands out a read-only reference to the options_ member.
  return options_;
}
|
||||
|
||||
// Argument converters; see below.
|
||||
static inline Arg CRadix(short* x);
|
||||
static inline Arg CRadix(unsigned short* x);
|
||||
static inline Arg CRadix(int* x);
|
||||
static inline Arg CRadix(unsigned int* x);
|
||||
static inline Arg CRadix(long* x);
|
||||
static inline Arg CRadix(unsigned long* x);
|
||||
static inline Arg CRadix(long long* x);
|
||||
static inline Arg CRadix(unsigned long long* x);
|
||||
|
||||
static inline Arg Hex(short* x);
|
||||
static inline Arg Hex(unsigned short* x);
|
||||
static inline Arg Hex(int* x);
|
||||
static inline Arg Hex(unsigned int* x);
|
||||
static inline Arg Hex(long* x);
|
||||
static inline Arg Hex(unsigned long* x);
|
||||
static inline Arg Hex(long long* x);
|
||||
static inline Arg Hex(unsigned long long* x);
|
||||
|
||||
static inline Arg Octal(short* x);
|
||||
static inline Arg Octal(unsigned short* x);
|
||||
static inline Arg Octal(int* x);
|
||||
static inline Arg Octal(unsigned int* x);
|
||||
static inline Arg Octal(long* x);
|
||||
static inline Arg Octal(unsigned long* x);
|
||||
static inline Arg Octal(long long* x);
|
||||
static inline Arg Octal(unsigned long long* x);
|
||||
|
||||
private:
|
||||
void Init(const StringPiece& pattern, const Options& options);
|
||||
|
||||
bool Rewrite(string *out,
|
||||
const StringPiece &rewrite,
|
||||
const StringPiece* vec,
|
||||
int veclen) const;
|
||||
|
||||
bool DoMatch(const StringPiece& text,
|
||||
Anchor anchor,
|
||||
int* consumed,
|
||||
const Arg* const args[],
|
||||
int n) const;
|
||||
|
||||
re2::Prog* ReverseProg() const;
|
||||
|
||||
mutable Mutex* mutex_;
|
||||
string pattern_; // string regular expression
|
||||
Options options_; // option flags
|
||||
string prefix_; // required prefix (before regexp_)
|
||||
bool prefix_foldcase_; // prefix is ASCII case-insensitive
|
||||
re2::Regexp* entire_regexp_; // parsed regular expression
|
||||
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
|
||||
re2::Prog* prog_; // compiled program for regexp
|
||||
mutable re2::Prog* rprog_; // reverse program for regexp
|
||||
bool is_one_pass_; // can use prog_->SearchOnePass?
|
||||
mutable const string* error_; // Error indicator
|
||||
// (or points to empty string)
|
||||
mutable ErrorCode error_code_; // Error code
|
||||
mutable string error_arg_; // Fragment of regexp showing error
|
||||
mutable int num_captures_; // Number of capturing groups
|
||||
|
||||
// Map from capture names to indices
|
||||
mutable const map<string, int>* named_groups_;
|
||||
|
||||
// Map from capture indices to names
|
||||
mutable const map<int, string>* group_names_;
|
||||
|
||||
//DISALLOW_EVIL_CONSTRUCTORS(RE2);
|
||||
RE2(const RE2&);
|
||||
void operator=(const RE2&);
|
||||
};
|
||||
|
||||
/***** Implementation details *****/
|
||||
|
||||
// Hex/Octal/Binary?
|
||||
|
||||
// Special class for parsing into objects that define a ParseFrom() method
|
||||
// Adapter that lets any type with a ParseFrom(const char*, int) method be
// used as a match destination.
template <class T>
class _RE2_MatchObject {
 public:
  // Forwards the n bytes at str to dest->ParseFrom() and returns its result.
  // A NULL dest means the caller does not want the value; that counts as
  // success and ParseFrom() is not called.
  static inline bool Parse(const char* str, int n, void* dest) {
    if (dest != NULL) {
      T* target = static_cast<T*>(dest);
      return target->ParseFrom(str, n);
    }
    return true;
  }
};
|
||||
|
||||
// A match destination: a type-erased pointer paired with the function that
// converts matched text into the pointed-to object.
class RE2::Arg {
 public:
  // Empty constructor so we can declare arrays of RE2::Arg
  Arg();

  // Constructor specially designed for NULL arguments
  Arg(void*);

  // Signature shared by all converters: parse the n bytes at str into dest
  // and report success.
  typedef bool (*Parser)(const char* str, int n, void* dest);

// Type-specific parsers: for each built-in destination type, one
// constructor using the default converter and one taking an explicit one.
#define MAKE_PARSER(type, name) \
  Arg(type* p) : arg_(p), parser_(name) { } \
  Arg(type* p, Parser parser) : arg_(p), parser_(parser) { }

  MAKE_PARSER(char,               parse_char);
  MAKE_PARSER(signed char,        parse_char);
  MAKE_PARSER(unsigned char,      parse_uchar);
  MAKE_PARSER(short,              parse_short);
  MAKE_PARSER(unsigned short,     parse_ushort);
  MAKE_PARSER(int,                parse_int);
  MAKE_PARSER(unsigned int,       parse_uint);
  MAKE_PARSER(long,               parse_long);
  MAKE_PARSER(unsigned long,      parse_ulong);
  MAKE_PARSER(long long,          parse_longlong);
  MAKE_PARSER(unsigned long long, parse_ulonglong);
  MAKE_PARSER(float,              parse_float);
  MAKE_PARSER(double,             parse_double);
  MAKE_PARSER(string,             parse_string);
  MAKE_PARSER(StringPiece,        parse_stringpiece);

#undef MAKE_PARSER

  // Generic constructor with a caller-supplied converter.
  // Defined inline here: it was previously only declared, so any use of it
  // with a non-built-in type could not be linked.
  template <class T> Arg(T* p, Parser parser)
    : arg_(p), parser_(parser) {
  }

  // Generic constructor template: the destination type must provide
  // ParseFrom(const char*, int), which _RE2_MatchObject<T>::Parse calls.
  template <class T> Arg(T* p)
    : arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
  }

  // Parse the data
  bool Parse(const char* str, int n) const;

 private:
  void*         arg_;     // destination object
  Parser        parser_;  // converter invoked by Parse()

  static bool parse_null          (const char* str, int n, void* dest);
  static bool parse_char          (const char* str, int n, void* dest);
  static bool parse_uchar         (const char* str, int n, void* dest);
  static bool parse_float         (const char* str, int n, void* dest);
  static bool parse_double        (const char* str, int n, void* dest);
  static bool parse_string        (const char* str, int n, void* dest);
  static bool parse_stringpiece   (const char* str, int n, void* dest);

// Declares the converter family for one integer type. The plain and
// explicit-radix forms are private; the _hex/_octal/_cradix forms are
// public (RE2::Hex, RE2::Octal and RE2::CRadix refer to them).
#define DECLARE_INTEGER_PARSER(name) \
 private: \
  static bool parse_ ## name(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _radix( \
    const char* str, int n, void* dest, int radix); \
 public: \
  static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
  static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)

  DECLARE_INTEGER_PARSER(short);
  DECLARE_INTEGER_PARSER(ushort);
  DECLARE_INTEGER_PARSER(int);
  DECLARE_INTEGER_PARSER(uint);
  DECLARE_INTEGER_PARSER(long);
  DECLARE_INTEGER_PARSER(ulong);
  DECLARE_INTEGER_PARSER(longlong);
  DECLARE_INTEGER_PARSER(ulonglong);

#undef DECLARE_INTEGER_PARSER
};
|
||||
|
||||
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
||||
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
||||
|
||||
inline bool RE2::Arg::Parse(const char* str, int n) const {
|
||||
return (*parser_)(str, n, arg_);
|
||||
}
|
||||
|
||||
// This part of the parser, appropriate only for ints, deals with bases
|
||||
#define MAKE_INTEGER_PARSER(type, name) \
|
||||
inline RE2::Arg RE2::Hex(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
|
||||
inline RE2::Arg RE2::Octal(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
|
||||
inline RE2::Arg RE2::CRadix(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
|
||||
|
||||
MAKE_INTEGER_PARSER(short, short);
|
||||
MAKE_INTEGER_PARSER(unsigned short, ushort);
|
||||
MAKE_INTEGER_PARSER(int, int);
|
||||
MAKE_INTEGER_PARSER(unsigned int, uint);
|
||||
MAKE_INTEGER_PARSER(long, long);
|
||||
MAKE_INTEGER_PARSER(unsigned long, ulong);
|
||||
MAKE_INTEGER_PARSER(long long, longlong);
|
||||
MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
|
||||
|
||||
#undef MAKE_INTEGER_PARSER
|
||||
|
||||
} // namespace re2
|
||||
|
||||
using re2::RE2;
|
||||
|
||||
#endif /* RE2_RE2_H */
|
|
@ -0,0 +1,920 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Regular expression representation.
|
||||
// Tested by parse_test.cc
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Constructor. Allocates vectors as appropriate for operator.
|
||||
// Builds a childless node for operator op with one reference held by the
// caller. The single-child slot and the operator-specific storage are
// cleared.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
  : op_(op), simple_(false),
    parse_flags_(static_cast<uint16>(parse_flags)),
    ref_(1), nsub_(0), down_(NULL) {
  subone_ = NULL;
  memset(the_union_, 0, sizeof(the_union_));
}
|
||||
|
||||
// Destructor. Assumes already cleaned up children.
|
||||
// Private: use Decref() instead of delete to destroy Regexps.
|
||||
// Can't call Decref on the sub-Regexps here because
|
||||
// that could cause arbitrarily deep recursion, so
|
||||
// we require Decref() to have handled them for us.
|
||||
// Releases the operator-specific payload of this node. Children must have
// been detached already (nsub_ == 0); a leftover child is reported through
// LOG(DFATAL).
Regexp::~Regexp() {
  if (nsub_ > 0)
    LOG(DFATAL) << "Regexp not destroyed.";

  switch (op_) {
    default:
      break;
    case kRegexpCapture:
      delete name_;
      break;
    case kRegexpLiteralString:
      delete[] runes_;
      break;
    case kRegexpCharClass:
      // Guard the call: invoking a member function through a null pointer
      // is undefined behavior, whatever Delete() does internally.
      // NOTE(review): cc_ presumably stays NULL while only the builder
      // (ccb_) has been populated -- confirm against the parser.
      if (cc_ != NULL)
        cc_->Delete();
      delete ccb_;
      break;
  }
}
|
||||
|
||||
// If it's possible to destroy this regexp without recurring,
|
||||
// do so and return true. Else return false.
|
||||
bool Regexp::QuickDestroy() {
|
||||
if (nsub_ == 0) {
|
||||
delete this;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static map<Regexp*, int> ref_map;
|
||||
static Mutex ref_mutex;
|
||||
|
||||
int Regexp::Ref() {
|
||||
if (ref_ < kMaxRef)
|
||||
return ref_;
|
||||
|
||||
MutexLock l(&ref_mutex);
|
||||
return ref_map[this];
|
||||
}
|
||||
|
||||
// Increments reference count, returns object as convenience.
|
||||
// Adds one reference and returns this for call chaining.
Regexp* Regexp::Incref() {
  // Common case: the in-object counter still has room.
  if (ref_ < kMaxRef-1) {
    ref_++;
    return this;
  }

  // The count is (or is about to be) kept in the overflow map.
  MutexLock l(&ref_mutex);
  if (ref_ == kMaxRef) {
    // Already overflowed: bump the map entry.
    ref_map[this]++;
  } else {
    // Overflowing now: move the count into the map and mark ref_.
    ref_map[this] = kMaxRef;
    ref_ = kMaxRef;
  }
  return this;
}
|
||||
|
||||
// Decrements reference count and deletes this object if count reaches 0.
|
||||
void Regexp::Decref() {
|
||||
if (ref_ == kMaxRef) {
|
||||
// Ref count is stored in overflow map.
|
||||
MutexLock l(&ref_mutex);
|
||||
int r = ref_map[this] - 1;
|
||||
if (r < kMaxRef) {
|
||||
ref_ = r;
|
||||
ref_map.erase(this);
|
||||
} else {
|
||||
ref_map[this] = r;
|
||||
}
|
||||
return;
|
||||
}
|
||||
ref_--;
|
||||
if (ref_ == 0)
|
||||
Destroy();
|
||||
}
|
||||
|
||||
// Deletes this object; its reference count has reached 0.
|
||||
void Regexp::Destroy() {
|
||||
if (QuickDestroy())
|
||||
return;
|
||||
|
||||
// Handle recursive Destroy with explicit stack
|
||||
// to avoid arbitrarily deep recursion on process stack [sigh].
|
||||
down_ = NULL;
|
||||
Regexp* stack = this;
|
||||
while (stack != NULL) {
|
||||
Regexp* re = stack;
|
||||
stack = re->down_;
|
||||
if (re->ref_ != 0)
|
||||
LOG(DFATAL) << "Bad reference count " << re->ref_;
|
||||
if (re->nsub_ > 0) {
|
||||
Regexp** subs = re->sub();
|
||||
for (int i = 0; i < re->nsub_; i++) {
|
||||
Regexp* sub = subs[i];
|
||||
if (sub == NULL)
|
||||
continue;
|
||||
if (sub->ref_ == kMaxRef)
|
||||
sub->Decref();
|
||||
else
|
||||
--sub->ref_;
|
||||
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
|
||||
sub->down_ = stack;
|
||||
stack = sub;
|
||||
}
|
||||
}
|
||||
if (re->nsub_ > 1)
|
||||
delete[] subs;
|
||||
re->nsub_ = 0;
|
||||
}
|
||||
delete re;
|
||||
}
|
||||
}
|
||||
|
||||
// Appends r to this literal string. Storage starts at 8 runes and is
// doubled each time the length reaches a power of two (8, 16, 32, ...).
void Regexp::AddRuneToString(Rune r) {
  DCHECK(op_ == kRegexpLiteralString);
  if (nrunes_ == 0) {
    runes_ = new Rune[8];
  } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
    // Full: copy into a buffer twice the size.
    Rune* grown = new Rune[nrunes_ * 2];
    for (int i = 0; i < nrunes_; i++)
      grown[i] = runes_[i];
    delete[] runes_;
    runes_ = grown;
  }

  runes_[nrunes_] = r;
  nrunes_++;
}
|
||||
|
||||
// Creates a kRegexpHaveMatch node carrying match_id.
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
  node->match_id_ = match_id;
  return node;
}
|
||||
|
||||
// Returns a node for sub+. If sub is already a plus with identical flags
// it is returned unchanged instead of being wrapped a second time.
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
  const bool already_plus =
      sub->op() == kRegexpPlus && sub->parse_flags() == flags;
  if (already_plus)
    return sub;

  Regexp* node = new Regexp(kRegexpPlus, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
||||
|
||||
// Returns a node for sub*. If sub is already a star with identical flags
// it is returned unchanged instead of being wrapped a second time.
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
  const bool already_star =
      sub->op() == kRegexpStar && sub->parse_flags() == flags;
  if (already_star)
    return sub;

  Regexp* node = new Regexp(kRegexpStar, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
||||
|
||||
// Returns a node for sub?. If sub is already a quest with identical flags
// it is returned unchanged instead of being wrapped a second time.
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
  const bool already_quest =
      sub->op() == kRegexpQuest && sub->parse_flags() == flags;
  if (already_quest)
    return sub;

  Regexp* node = new Regexp(kRegexpQuest, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
||||
|
||||
// Builds a concatenation or alternation (selected by op) of the nsub
// expressions in sub.
//   - A single operand is returned as-is.
//   - No operands yields the identity for the operator: an empty
//     alternation matches nothing, an empty concatenation matches the
//     empty string. (Previously this built an operator node with no
//     children.)
//   - For alternations with can_factor set, FactorAlternation is run on a
//     private copy of sub, so the caller's array is not modified.
//   - More than kMaxNsub operands are split into a two-level tree.
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
                                  ParseFlags flags, bool can_factor) {
  if (nsub == 1)
    return sub[0];

  if (nsub == 0) {
    if (op == kRegexpAlternate)
      return new Regexp(kRegexpNoMatch, flags);
    return new Regexp(kRegexpEmptyMatch, flags);
  }

  Regexp** subcopy = NULL;
  if (op == kRegexpAlternate && can_factor) {
    // Going to edit sub; make a copy so we don't step on caller.
    subcopy = new Regexp*[nsub];
    memmove(subcopy, sub, nsub * sizeof sub[0]);
    sub = subcopy;
    nsub = FactorAlternation(sub, nsub, flags);
    if (nsub == 1) {
      Regexp* re = sub[0];
      delete[] subcopy;
      return re;
    }
  }

  if (nsub > kMaxNsub) {
    // Too many subexpressions to fit in a single Regexp.
    // Make a two-level tree.  Two levels gets us to 65535^2.
    int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
    Regexp* re = new Regexp(op, flags);
    re->AllocSub(nbigsub);
    Regexp** subs = re->sub();
    // Every group but the last holds exactly kMaxNsub operands.
    for (int i = 0; i < nbigsub - 1; i++)
      subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
    // The last group takes whatever remains.
    subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
                                          nsub - (nbigsub-1)*kMaxNsub, flags,
                                          false);
    delete[] subcopy;
    return re;
  }

  Regexp* re = new Regexp(op, flags);
  re->AllocSub(nsub);
  Regexp** subs = re->sub();
  for (int i = 0; i < nsub; i++)
    subs[i] = sub[i];

  delete[] subcopy;
  return re;
}
|
||||
|
||||
// Concatenation of the nsub expressions in sub; factoring is not requested.
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Alternation of the nsub expressions in sub, with factoring enabled.
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = true;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Alternation of the nsub expressions in sub, with factoring disabled.
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Wraps sub in a capturing group with index cap.
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
  Regexp* group = new Regexp(kRegexpCapture, flags);
  group->AllocSub(1);
  group->cap_ = cap;
  group->sub()[0] = sub;
  return group;
}
|
||||
|
||||
// Wraps sub in a kRegexpRepeat node with the given min and max counts.
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
  Regexp* rep = new Regexp(kRegexpRepeat, flags);
  rep->AllocSub(1);
  rep->min_ = min;
  rep->max_ = max;
  rep->sub()[0] = sub;
  return rep;
}
|
||||
|
||||
// Creates a node matching the single rune.
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
  Regexp* lit = new Regexp(kRegexpLiteral, flags);
  lit->rune_ = rune;
  return lit;
}
|
||||
|
||||
// Creates a node matching the nrunes runes in order. A single rune becomes
// a plain literal; zero (or fewer) runes become an empty-match node.
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
  if (nrunes == 1)
    return NewLiteral(runes[0], flags);
  if (nrunes <= 0)
    return new Regexp(kRegexpEmptyMatch, flags);

  Regexp* str = new Regexp(kRegexpLiteralString, flags);
  int i = 0;
  while (i < nrunes) {
    str->AddRuneToString(runes[i]);
    i++;
  }
  return str;
}
|
||||
|
||||
// Creates a character-class node and stores cc in it.
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpCharClass, flags);
  node->cc_ = cc;
  return node;
}
|
||||
|
||||
// Swaps this and that in place.
|
||||
// Exchanges the raw bytes of *this and *that via a stack buffer.
void Regexp::Swap(Regexp* that) {
  // Byte-wise moves are acceptable because Regexp is just a struct
  // (no vtable).
  char scratch[sizeof(*this)];
  memmove(scratch, this, sizeof scratch);
  memmove(this, that, sizeof scratch);
  memmove(that, scratch, sizeof scratch);
}
|
||||
|
||||
// Tests equality of all top-level structure but not subregexps.
|
||||
// Compares the top-level nodes a and b only: operator plus the
// operator-specific payload and the parse flags relevant to it.
// Sub-expressions are not examined (Regexp::Equal walks those).
static bool TopEqual(Regexp* a, Regexp* b) {
  if (a->op() != b->op())
    return false;

  switch (a->op()) {
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpBeginText:
      return true;

    case kRegexpEndText:
      // The parse flags remember whether it's \z or (?-m:$),
      // which matters when testing against PCRE.
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;

    case kRegexpLiteral:
      return a->rune() == b->rune() &&
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;

    case kRegexpLiteralString:
      return a->nrunes() == b->nrunes() &&
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
             memcmp(a->runes(), b->runes(),
                    a->nrunes() * sizeof a->runes()[0]) == 0;

    case kRegexpAlternate:
    case kRegexpConcat:
      return a->nsub() == b->nsub();

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;

    case kRegexpRepeat:
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
             a->min() == b->min() &&
             a->max() == b->max();

    case kRegexpCapture: {
      if (a->cap() != b->cap())
        return false;
      // name() yields a pointer. Comparing the pointers would call two
      // groups with the same name unequal whenever the names live in
      // different string objects, so compare the pointed-to text.
      const string* aname = a->name();
      const string* bname = b->name();
      if (aname == NULL || bname == NULL)
        return aname == bname;  // equal only when both are unnamed
      return *aname == *bname;
    }

    case kRegexpHaveMatch:
      return a->match_id() == b->match_id();

    case kRegexpCharClass: {
      CharClass* acc = a->cc();
      CharClass* bcc = b->cc();
      return acc->size() == bcc->size() &&
             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
             memcmp(acc->begin(), bcc->begin(),
                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
    }
  }

  LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
  return false;
}
|
||||
|
||||
// Structural equality of two regexps. Two NULLs are equal; a NULL and a
// non-NULL are not. The trees are walked with an explicit stack of
// pending (a, b) pairs rather than by recursion.
bool Regexp::Equal(Regexp* a, Regexp* b) {
  if (a == NULL || b == NULL)
    return a == b;

  if (!TopEqual(a, b))
    return false;

  // Fast path: operators without sub-expressions are fully compared
  // already, so return before allocating the stack.
  switch (a->op()) {
    default:
      return true;

    case kRegexpAlternate:
    case kRegexpConcat:
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
    case kRegexpCapture:
      break;
  }

  // Pairs still to be expanded, stored as a, b, a, b, ...
  // The regexps are equal only if every pair turns out equal.
  vector<Regexp*> pending;

  for (;;) {
    // Invariant: TopEqual(a, b) == true.
    switch (a->op()) {
      default:
        break;

      case kRegexpAlternate:
      case kRegexpConcat:
        for (int i = 0; i < a->nsub(); i++) {
          Regexp* asub = a->sub()[i];
          Regexp* bsub = b->sub()[i];
          if (!TopEqual(asub, bsub))
            return false;
          pending.push_back(asub);
          pending.push_back(bsub);
        }
        break;

      case kRegexpStar:
      case kRegexpPlus:
      case kRegexpQuest:
      case kRegexpRepeat:
      case kRegexpCapture: {
        Regexp* asub = a->sub()[0];
        Regexp* bsub = b->sub()[0];
        if (!TopEqual(asub, bsub))
          return false;
        // Only one child: descend directly instead of pushing the pair
        // and popping it straight back.
        a = asub;
        b = bsub;
        continue;
      }
    }

    if (pending.empty())
      break;

    // Pop the most recently pushed pair (b sits on top of a).
    b = pending.back();
    pending.pop_back();
    a = pending.back();
    pending.pop_back();
  }

  return true;
}
|
||||
|
||||
// Keep in sync with enum RegexpStatusCode in regexp.h
|
||||
static const string kErrorStrings[] = {
|
||||
"no error",
|
||||
"unexpected error",
|
||||
"invalid escape sequence",
|
||||
"invalid character class",
|
||||
"invalid character class range",
|
||||
"missing ]",
|
||||
"missing )",
|
||||
"trailing \\",
|
||||
"no argument for repetition operator",
|
||||
"invalid repetition size",
|
||||
"bad repetition operator",
|
||||
"invalid perl operator",
|
||||
"invalid UTF-8",
|
||||
"invalid named capture group",
|
||||
};
|
||||
|
||||
// Returns the message for code. A code outside the kErrorStrings table is
// reported with the kRegexpInternalError message.
const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
  const bool in_table = !(code < 0 || code >= arraysize(kErrorStrings));
  if (in_table)
    return kErrorStrings[code];
  return kErrorStrings[kRegexpInternalError];
}
|
||||
|
||||
// Returns the message for code_, followed by ": " and error_arg_ when
// error_arg_ is non-empty.
string RegexpStatus::Text() const {
  string text(CodeText(code_));
  if (!error_arg_.empty()) {
    text.append(": ");
    text.append(error_arg_.data(), error_arg_.size());
  }
  return text;
}
|
||||
|
||||
// Copies the error argument and the status code from status.
void RegexpStatus::Copy(const RegexpStatus& status) {
  error_arg_ = status.error_arg_;
  code_ = status.code_;
}
|
||||
|
||||
typedef int Ignored; // Walker<void> doesn't exist
|
||||
|
||||
// Walker subclass to count capturing parens in regexp.
|
||||
// Walker that tallies the kRegexpCapture nodes it visits.
class NumCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NumCapturesWalker() : ncapture_(0) {}

  // Number of capture nodes seen so far.
  int ncapture() { return ncapture_; }

  // Counts re if it is a capture; the walk value is passed through as-is.
  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    const bool is_capture = (re->op() == kRegexpCapture);
    if (is_capture)
      ++ncapture_;
    return ignored;
  }

  // Should never be called: we use Walk not WalkExponential.
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  int ncapture_;
  DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
};
|
||||
|
||||
int Regexp::NumCaptures() {
|
||||
NumCapturesWalker w;
|
||||
w.Walk(this, 0);
|
||||
return w.ncapture();
|
||||
}
|
||||
|
||||
// Walker class to build map of named capture groups and their indices.
|
||||
// Walker that collects a name -> index map of the named capture groups.
// The map is created lazily, so it stays NULL when no group is named.
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NamedCapturesWalker() : map_(NULL) {}
  ~NamedCapturesWalker() { delete map_; }

  // Transfers ownership of the map (possibly NULL) to the caller.
  map<string, int>* TakeMap() {
    map<string, int>* taken = map_;
    map_ = NULL;
    return taken;
  }

  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // Allocate map once we find a name.
    if (map_ == NULL)
      map_ = new map<string, int>;

    // insert() leaves an existing entry alone, so only the first
    // occurrence of each name is recorded. (The rule is that if you have
    // the same name multiple times, only the leftmost one counts.)
    map_->insert(map<string, int>::value_type(*re->name(), re->cap()));
    return ignored;
  }

  // Should never be called: we use Walk not WalkExponential.
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  map<string, int>* map_;
  DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
};
|
||||
|
||||
// Returns a newly built name -> index map of the named groups, which the
// caller takes over. The result is NULL if no named group was found.
map<string, int>* Regexp::NamedCaptures() {
  NamedCapturesWalker collector;
  collector.Walk(this, 0);
  map<string, int>* names = collector.TakeMap();
  return names;
}
|
||||
|
||||
// Walker class to build map from capture group indices to their names.
|
||||
// Walker that collects an index -> name map of the named capture groups.
// The map is created lazily, so it stays NULL when no group is named.
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
 public:
  CaptureNamesWalker() : map_(NULL) {}
  ~CaptureNamesWalker() { delete map_; }

  // Transfers ownership of the map (possibly NULL) to the caller.
  map<int, string>* TakeMap() {
    map<int, string>* taken = map_;
    map_ = NULL;
    return taken;
  }

  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // Allocate map once we find a name.
    if (map_ == NULL)
      map_ = new map<int, string>;

    (*map_)[re->cap()] = *re->name();
    return ignored;
  }

  // Should never be called: we use Walk not WalkExponential.
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
    return ignored;
  }

 private:
  map<int, string>* map_;
  DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
};
|
||||
|
||||
// Returns a newly built index -> name map of the named groups, which the
// caller takes over. The result is NULL if no named group was found.
map<int, string>* Regexp::CaptureNames() {
  CaptureNamesWalker collector;
  collector.Walk(this, 0);
  map<int, string>* names = collector.TakeMap();
  return names;
}
|
||||
|
||||
// Determines whether regexp matches must be anchored
|
||||
// with a fixed string prefix. If so, returns the prefix and
|
||||
// the regexp that remains after the prefix. The prefix might
|
||||
// be ASCII case-insensitive.
|
||||
// Checks whether this regexp has the shape
//   1. one or more ^ (begin-text) anchors,
//   2. a literal char or literal string,
//   3. the rest,
// and if so returns true with *prefix set to the literal (encoded per the
// literal's Latin1 flag), *foldcase set from its FoldCase flag, and
// *suffix set to a new regexp for "the rest". Otherwise returns false;
// *prefix is then empty, *foldcase false and *suffix NULL.
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
  prefix->clear();
  *foldcase = false;
  *suffix = NULL;
  if (op_ != kRegexpConcat)
    return false;

  // Skip the leading anchors; at least one is required, and something
  // must follow them.
  Regexp** sub = this->sub();
  int i = 0;
  for (; i < nsub_; i++) {
    if (sub[i]->op_ != kRegexpBeginText)
      break;
  }
  if (i == 0 || i >= nsub_)
    return false;

  Regexp* lit = sub[i];
  const bool latin1 = (lit->parse_flags() & Latin1) != 0;
  if (lit->op_ == kRegexpLiteralString) {
    // Convert to string in proper encoding.
    if (latin1) {
      prefix->resize(lit->nrunes_);
      for (int j = 0; j < lit->nrunes_; j++)
        (*prefix)[j] = lit->runes_[j];
    } else {
      // Convert to UTF-8 in place.
      // Assume worst-case space and then trim.
      prefix->resize(lit->nrunes_ * UTFmax);
      char* const start = &(*prefix)[0];
      char* out = start;
      for (int j = 0; j < lit->nrunes_; j++) {
        Rune r = lit->runes_[j];
        if (r < Runeself)
          *out++ = r;
        else
          out += runetochar(out, &r);
      }
      prefix->resize(out - start);
    }
  } else if (lit->op_ == kRegexpLiteral) {
    if (latin1 || lit->rune_ < Runeself) {
      prefix->append(1, lit->rune_);
    } else {
      char buf[UTFmax];
      prefix->append(buf, runetochar(buf, &lit->rune_));
    }
  } else {
    return false;
  }
  *foldcase = (lit->parse_flags() & FoldCase) != 0;
  i++;

  // The rest: a concatenation of what follows the literal (each piece
  // gains a reference), or an empty match if nothing follows.
  if (i < nsub_) {
    for (int j = i; j < nsub_; j++)
      sub[j]->Incref();
    *suffix = Concat(sub + i, nsub_ - i, parse_flags());
  } else {
    *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
  }
  return true;
}
|
||||
|
||||
// Character class builder is a balanced binary tree (STL set)
|
||||
// containing non-overlapping, non-abutting RuneRanges.
|
||||
// The less-than operator used in the tree treats two
|
||||
// ranges as equal if they overlap at all, so that
|
||||
// lookups for a particular Rune are possible.
|
||||
|
||||
// Starts with an empty class: no ASCII letters recorded, no runes counted.
CharClassBuilder::CharClassBuilder() {
  upper_ = 0;
  lower_ = 0;
  nrunes_ = 0;
}
|
||||
|
||||
// Add lo-hi to the class; return whether class got bigger.
|
||||
// Adds the runes lo..hi (inclusive) to the class. Returns false if
// hi < lo or if the whole range was already present, true otherwise.
// Ranges that abut or overlap the new one are merged into it, so the set
// keeps holding non-overlapping, non-abutting ranges.
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
  if (hi < lo)
    return false;

  if (lo <= 'z' && hi >= 'A') {
    // Overlaps some alpha, maybe not all.
    // Update bitmaps telling which ASCII letters are in the set.
    Rune first = max<Rune>(lo, 'A');
    Rune last = min<Rune>(hi, 'Z');
    if (first <= last)
      upper_ |= ((1 << (last - first + 1)) - 1) << (first - 'A');

    first = max<Rune>(lo, 'a');
    last = min<Rune>(hi, 'z');
    if (first <= last)
      lower_ |= ((1 << (last - first + 1)) - 1) << (first - 'a');
  }

  {  // Nothing to do if one existing range already covers lo..hi.
    iterator covering = ranges_.find(RuneRange(lo, lo));
    if (covering != end() && covering->lo <= lo && hi <= covering->hi)
      return false;
  }

  // A range containing lo-1 touches us on the left: absorb it.
  if (lo > 0) {
    iterator left = ranges_.find(RuneRange(lo-1, lo-1));
    if (left != end()) {
      lo = left->lo;
      if (left->hi > hi)
        hi = left->hi;
      nrunes_ -= left->hi - left->lo + 1;
      ranges_.erase(left);
    }
  }

  // A range containing hi+1 touches us on the right: absorb it.
  if (hi < Runemax) {
    iterator right = ranges_.find(RuneRange(hi+1, hi+1));
    if (right != end()) {
      hi = right->hi;
      nrunes_ -= right->hi - right->lo + 1;
      ranges_.erase(right);
    }
  }

  // Remove whatever still overlaps [lo, hi].
  // This is only safe because the set has no overlapping ranges.
  // The neighbours on both sides are already gone, so anything found
  // here must lie entirely inside [lo, hi].
  for (iterator inner = ranges_.find(RuneRange(lo, hi));
       inner != end();
       inner = ranges_.find(RuneRange(lo, hi))) {
    nrunes_ -= inner->hi - inner->lo + 1;
    ranges_.erase(inner);
  }

  // Finally, add [lo, hi].
  ranges_.insert(RuneRange(lo, hi));
  nrunes_ += hi - lo + 1;
  return true;
}
|
||||
|
||||
// Merges every range of *cc into this builder via AddRange.
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
  iterator src = cc->begin();
  while (src != cc->end()) {
    AddRange(src->lo, src->hi);
    ++src;
  }
}
|
||||
|
||||
// Reports whether some stored range covers r.
bool CharClassBuilder::Contains(Rune r) {
  const RuneRange probe(r, r);
  iterator hit = ranges_.find(probe);
  return hit != end();
}
|
||||
|
||||
// Does the character class behave the same on A-Z as on a-z?
|
||||
bool CharClassBuilder::FoldsASCII() {
|
||||
return ((upper_ ^ lower_) & AlphaMask) == 0;
|
||||
}
|
||||
|
||||
// Returns a newly allocated builder holding the same ranges, letter
// bitmaps and rune count as this one.
CharClassBuilder* CharClassBuilder::Copy() {
  CharClassBuilder* dup = new CharClassBuilder;
  dup->nrunes_ = nrunes_;
  dup->lower_ = lower_;
  dup->upper_ = upper_;
  iterator src = begin();
  while (src != end()) {
    dup->ranges_.insert(RuneRange(src->lo, src->hi));
    ++src;
  }
  return dup;
}
|
||||
|
||||
|
||||
|
||||
// Removes every rune greater than r from the class, trimming a range
// that straddles r so that it ends at r.
void CharClassBuilder::RemoveAbove(Rune r) {
  if (r >= Runemax)
    return;

  // Clear the lowercase bits for letters above r.
  if (r < 'a')
    lower_ = 0;
  else if (r < 'z')
    lower_ &= AlphaMask >> ('z' - r);

  // Same for the uppercase bitmap.
  if (r < 'A')
    upper_ = 0;
  else if (r < 'Z')
    upper_ &= AlphaMask >> ('Z' - r);

  // Repeatedly pull out any range overlapping [r+1, Runemax].
  const RuneRange above(r + 1, Runemax);
  iterator hit;
  while ((hit = ranges_.find(above)) != end()) {
    RuneRange rr = *hit;
    ranges_.erase(hit);
    nrunes_ -= rr.hi - rr.lo + 1;
    if (rr.lo > r)
      continue;  // entirely above r: gone for good

    // Straddles r: put back the part that survives.
    rr.hi = r;
    ranges_.insert(rr);
    nrunes_ += rr.hi - rr.lo + 1;
  }
}
|
||||
|
||||
void CharClassBuilder::Negate() {
|
||||
// Build up negation and then copy in.
|
||||
// Could edit ranges in place, but C++ won't let me.
|
||||
vector<RuneRange> v;
|
||||
v.reserve(ranges_.size() + 1);
|
||||
|
||||
// In negation, first range begins at 0, unless
|
||||
// the current class begins at 0.
|
||||
iterator it = begin();
|
||||
if (it == end()) {
|
||||
v.push_back(RuneRange(0, Runemax));
|
||||
} else {
|
||||
int nextlo = 0;
|
||||
if (it->lo == 0) {
|
||||
nextlo = it->hi + 1;
|
||||
++it;
|
||||
}
|
||||
for (; it != end(); ++it) {
|
||||
v.push_back(RuneRange(nextlo, it->lo - 1));
|
||||
nextlo = it->hi + 1;
|
||||
}
|
||||
if (nextlo <= Runemax)
|
||||
v.push_back(RuneRange(nextlo, Runemax));
|
||||
}
|
||||
|
||||
ranges_.clear();
|
||||
for (int i = 0; i < v.size(); i++)
|
||||
ranges_.insert(v[i]);
|
||||
|
||||
upper_ = AlphaMask & ~upper_;
|
||||
lower_ = AlphaMask & ~lower_;
|
||||
nrunes_ = Runemax+1 - nrunes_;
|
||||
}
|
||||
|
||||
// Character class is a sorted list of ranges.
|
||||
// The ranges are allocated in the same block as the header,
|
||||
// necessitating a special allocator and Delete method.
|
||||
|
||||
// Allocates a CharClass header and room for maxranges RuneRanges in one
// byte array; ranges_ points just past the header.  The result starts
// out empty (no ranges, no runes, folds_ascii_ false).
CharClass* CharClass::New(int maxranges) {
  uint8* block = new uint8[sizeof(CharClass) + maxranges*sizeof(RuneRange)];
  CharClass* result = reinterpret_cast<CharClass*>(block);
  result->folds_ascii_ = false;
  result->nrunes_ = 0;
  result->ranges_ = reinterpret_cast<RuneRange*>(block + sizeof(CharClass));
  result->nranges_ = 0;
  return result;
}
|
||||
|
||||
// Frees the byte array this object lives in (see CharClass::New).
// NOTE(review): the null test on `this` is kept as-is; confirm whether any
// caller relies on invoking Delete() through a null pointer before
// removing it.
void CharClass::Delete() {
  if (this != NULL) {
    delete[] reinterpret_cast<uint8*>(this);
  }
}
|
||||
|
||||
// Returns a newly allocated CharClass holding the complement of this one
// over [0, Runemax].  folds_ascii_ is copied unchanged.
CharClass* CharClass::Negate() {
  // The complement has at most one more range than the original.
  CharClass* inv = CharClass::New(nranges_+1);
  inv->nrunes_ = Runemax + 1 - nrunes_;
  inv->folds_ascii_ = folds_ascii_;

  int count = 0;
  int gap_start = 0;
  for (CharClass::iterator cur = begin(); cur != end(); ++cur) {
    // Emit the gap in front of this range unless the range begins
    // exactly where the gap would.
    if (cur->lo != gap_start)
      inv->ranges_[count++] = RuneRange(gap_start, cur->lo - 1);
    gap_start = cur->hi + 1;
  }
  if (gap_start <= Runemax)
    inv->ranges_[count++] = RuneRange(gap_start, Runemax);

  inv->nranges_ = count;
  return inv;
}
|
||||
|
||||
// Binary search over the sorted range array for a range covering r.
bool CharClass::Contains(Rune r) {
  int first = 0;          // search window is [first, limit)
  int limit = nranges_;
  while (first < limit) {
    const int mid = first + (limit - first)/2;
    const RuneRange& cand = ranges_[mid];
    if (cand.hi < r)
      first = mid + 1;    // r lies to the right of cand
    else if (r < cand.lo)
      limit = mid;        // r lies to the left of cand
    else
      return true;        // cand.lo <= r && r <= cand.hi
  }
  return false;
}
|
||||
|
||||
// Builds a CharClass snapshot of the builder: ranges copied in set
// iteration order, plus the rune count and the FoldsASCII() result.
CharClass* CharClassBuilder::GetCharClass() {
  CharClass* out = CharClass::New(ranges_.size());
  int count = 0;
  iterator src = begin();
  while (src != end()) {
    out->ranges_[count] = *src;
    ++count;
    ++src;
  }
  out->nranges_ = count;
  DCHECK_LE(count, ranges_.size());
  out->nrunes_ = nrunes_;
  out->folds_ascii_ = FoldsASCII();
  return out;
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,632 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// --- SPONSORED LINK --------------------------------------------------
|
||||
// If you want to use this library for regular expression matching,
|
||||
// you should use re2/re2.h, which provides a class RE2 that
|
||||
// mimics the PCRE interface provided by PCRE's C++ wrappers.
|
||||
// This header describes the low-level interface used to implement RE2
|
||||
// and may change in backwards-incompatible ways from time to time.
|
||||
// In contrast, RE2's interface will not.
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
// Regular expression library: parsing, execution, and manipulation
|
||||
// of regular expressions.
|
||||
//
|
||||
// Any operation that traverses the Regexp structures should be written
|
||||
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
|
||||
// regular expressions such as x++++++++++++++++++++... might cause recursive
|
||||
// traversals to overflow the stack.
|
||||
//
|
||||
// It is the caller's responsibility to provide appropriate mutual exclusion
|
||||
// around manipulation of the regexps. RE2 does this.
|
||||
//
|
||||
// PARSING
|
||||
//
|
||||
// Regexp::Parse parses regular expressions encoded in UTF-8.
|
||||
// The default syntax is POSIX extended regular expressions,
|
||||
// with the following changes:
|
||||
//
|
||||
// 1. Backreferences (optional in POSIX EREs) are not supported.
|
||||
// (Supporting them precludes the use of DFA-based
|
||||
// matching engines.)
|
||||
//
|
||||
// 2. Collating elements and collation classes are not supported.
|
||||
// (No one has needed or wanted them.)
|
||||
//
|
||||
// The exact syntax accepted can be modified by passing flags to
|
||||
// Regexp::Parse. In particular, many of the basic Perl additions
|
||||
// are available. The flags are documented below (search for LikePerl).
|
||||
//
|
||||
// If parsed with the flag Regexp::Latin1, both the regular expression
|
||||
// and the input to the matching routines are assumed to be encoded in
|
||||
// Latin-1, not UTF-8.
|
||||
//
|
||||
// EXECUTION
|
||||
//
|
||||
// Once Regexp has parsed a regular expression, it provides methods
|
||||
// to search text using that regular expression. These methods are
|
||||
// implemented via calling out to other regular expression libraries.
|
||||
// (Let's call them the sublibraries.)
|
||||
//
|
||||
// To call a sublibrary, Regexp does not simply prepare a
|
||||
// string version of the regular expression and hand it to the
|
||||
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
|
||||
// corresponding internal representation used by the sublibrary.
|
||||
// This has the drawback of needing to know the internal representation
|
||||
// used by the sublibrary, but it has two important benefits:
|
||||
//
|
||||
// 1. The syntax and meaning of regular expressions is guaranteed
|
||||
// to be that used by Regexp's parser, not the syntax expected
|
||||
// by the sublibrary. Regexp might accept a restricted or
|
||||
// expanded syntax for regular expressions as compared with
|
||||
// the sublibrary. As long as Regexp can translate from its
|
||||
// internal form into the sublibrary's, clients need not know
|
||||
// exactly which sublibrary they are using.
|
||||
//
|
||||
// 2. The sublibrary parsers are bypassed. For whatever reason,
|
||||
// sublibrary regular expression parsers often have security
|
||||
// problems. For example, plan9grep's regular expression parser
|
||||
// has a buffer overflow in its handling of large character
|
||||
// classes, and PCRE's parser has had buffer overflow problems
|
||||
// in the past. Security-team requires sandboxing of sublibrary
|
||||
// regular expression parsers. Avoiding the sublibrary parsers
|
||||
// avoids the sandbox.
|
||||
//
|
||||
// The execution methods we use now are provided by the compiled form,
|
||||
// Prog, described in prog.h
|
||||
//
|
||||
// MANIPULATION
|
||||
//
|
||||
// Unlike other regular expression libraries, Regexp makes its parsed
|
||||
// form accessible to clients, so that client code can analyze the
|
||||
// parsed regular expressions.
|
||||
|
||||
#ifndef RE2_REGEXP_H__
|
||||
#define RE2_REGEXP_H__
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
|
||||
enum RegexpOp {
  kRegexpNoMatch = 1,     // matches no strings at all
  kRegexpEmptyMatch,      // matches the empty string

  kRegexpLiteral,         // matches rune_
  kRegexpLiteralString,   // matches runes_

  kRegexpConcat,          // concatenation of sub_[0..nsub-1]
  kRegexpAlternate,       // union of sub_[0..nsub-1]

  kRegexpStar,            // sub_[0] zero or more times
  kRegexpPlus,            // sub_[0] one or more times
  kRegexpQuest,           // sub_[0] zero or one times

  // sub_[0] between min_ and max_ times; max_ == -1 means unbounded.
  kRegexpRepeat,

  // Capturing parenthesized subexpression with index cap_ and,
  // optionally, capture name name_.
  kRegexpCapture,

  kRegexpAnyChar,         // any character
  kRegexpAnyByte,         // any byte [sic]

  kRegexpBeginLine,       // empty string at beginning of line
  kRegexpEndLine,         // empty string at end of line

  kRegexpWordBoundary,    // word boundary "\b"
  kRegexpNoWordBoundary,  // not-a-word boundary "\B"

  kRegexpBeginText,       // empty string at beginning of text
  kRegexpEndText,         // empty string at end of text

  kRegexpCharClass,       // character class given by cc_

  // Forces a match of the whole expression right now, with match ID
  // match_id_ (used by RE2::Set).
  kRegexpHaveMatch,

  kMaxRegexpOp = kRegexpHaveMatch,
};
|
||||
|
||||
// Keep in sync with string list in regexp.cc
|
||||
enum RegexpStatusCode {
  kRegexpSuccess = 0,        // no error

  kRegexpInternalError,      // unexpected error

  // Everything below is a parse error.
  kRegexpBadEscape,          // bad escape sequence
  kRegexpBadCharClass,       // bad character class
  kRegexpBadCharRange,       // bad character class range
  kRegexpMissingBracket,     // missing closing ]
  kRegexpMissingParen,       // missing closing )
  kRegexpTrailingBackslash,  // backslash at end of regexp
  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
  kRegexpRepeatSize,         // bad repetition argument
  kRegexpRepeatOp,           // bad repetition operator
  kRegexpBadPerlOp,          // bad perl operator
  kRegexpBadUTF8,            // invalid UTF-8 in regexp
  kRegexpBadNamedCapture,    // bad named capture
};
|
||||
|
||||
// Error status for certain operations.
|
||||
class RegexpStatus {
|
||||
public:
|
||||
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
|
||||
~RegexpStatus() { delete tmp_; }
|
||||
|
||||
void set_code(enum RegexpStatusCode code) { code_ = code; }
|
||||
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
|
||||
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
|
||||
enum RegexpStatusCode code() const { return code_; }
|
||||
const StringPiece& error_arg() const { return error_arg_; }
|
||||
bool ok() const { return code() == kRegexpSuccess; }
|
||||
|
||||
// Copies state from status.
|
||||
void Copy(const RegexpStatus& status);
|
||||
|
||||
// Returns text equivalent of code, e.g.:
|
||||
// "Bad character class"
|
||||
static const string& CodeText(enum RegexpStatusCode code);
|
||||
|
||||
// Returns text describing error, e.g.:
|
||||
// "Bad character class: [z-a]"
|
||||
string Text() const;
|
||||
|
||||
private:
|
||||
enum RegexpStatusCode code_; // Kind of error
|
||||
StringPiece error_arg_; // Piece of regexp containing syntax error.
|
||||
string* tmp_; // Temporary storage, possibly where error_arg_ is.
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
|
||||
};
|
||||
|
||||
// Walker to implement Simplify.
|
||||
class SimplifyWalker;
|
||||
|
||||
// Compiled form; see prog.h
|
||||
class Prog;
|
||||
|
||||
struct RuneRange {
|
||||
RuneRange() : lo(0), hi(0) { }
|
||||
RuneRange(int l, int h) : lo(l), hi(h) { }
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
};
|
||||
|
||||
// Less-than on RuneRanges treats a == b if they overlap at all.
|
||||
// This lets us look in a set to find the range covering a particular Rune.
|
||||
struct RuneRangeLess {
|
||||
bool operator()(const RuneRange& a, const RuneRange& b) const {
|
||||
return a.hi < b.lo;
|
||||
}
|
||||
};
|
||||
|
||||
class CharClassBuilder;
|
||||
|
||||
class CharClass {
|
||||
public:
|
||||
void Delete();
|
||||
|
||||
typedef RuneRange* iterator;
|
||||
iterator begin() { return ranges_; }
|
||||
iterator end() { return ranges_ + nranges_; }
|
||||
|
||||
int size() { return nrunes_; }
|
||||
bool empty() { return nrunes_ == 0; }
|
||||
bool full() { return nrunes_ == Runemax+1; }
|
||||
bool FoldsASCII() { return folds_ascii_; }
|
||||
|
||||
bool Contains(Rune r);
|
||||
CharClass* Negate();
|
||||
|
||||
private:
|
||||
CharClass(); // not implemented
|
||||
~CharClass(); // not implemented
|
||||
static CharClass* New(int maxranges);
|
||||
|
||||
friend class CharClassBuilder;
|
||||
|
||||
bool folds_ascii_;
|
||||
int nrunes_;
|
||||
RuneRange *ranges_;
|
||||
int nranges_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(CharClass);
|
||||
};
|
||||
|
||||
class Regexp {
|
||||
public:
|
||||
|
||||
// Flags for parsing. Can be ORed together.
|
||||
enum ParseFlags {
|
||||
NoParseFlags = 0,
|
||||
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
|
||||
Literal = 1<<1, // Treat s as literal string instead of a regexp.
|
||||
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
|
||||
// and [[:space:]] to match newline.
|
||||
DotNL = 1<<3, // Allow . to match newline.
|
||||
MatchNL = ClassNL | DotNL,
|
||||
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
|
||||
// end of text, not around embedded newlines.
|
||||
// (Perl's default)
|
||||
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
|
||||
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
|
||||
PerlClasses = 1<<7, // Allow Perl character classes like \d.
|
||||
PerlB = 1<<8, // Allow Perl's \b and \B.
|
||||
PerlX = 1<<9, // Perl extensions:
|
||||
// non-capturing parens - (?: )
|
||||
// non-greedy operators - *? +? ?? {}?
|
||||
// flag edits - (?i) (?-i) (?i: )
|
||||
// i - FoldCase
|
||||
// m - !OneLine
|
||||
// s - DotNL
|
||||
// U - NonGreedy
|
||||
// line ends: \A \z
|
||||
// \Q and \E to disable/enable metacharacters
|
||||
// (?P<name>expr) for named captures
|
||||
// \C to match any single byte
|
||||
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
|
||||
// and \P{Han} for its negation.
|
||||
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
|
||||
// it explicitly.
|
||||
|
||||
// As close to Perl as we can get.
|
||||
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
|
||||
UnicodeGroups,
|
||||
|
||||
// Internal use only.
|
||||
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
|
||||
};
|
||||
|
||||
// Get. No set, Regexps are logically immutable once created.
|
||||
RegexpOp op() { return static_cast<RegexpOp>(op_); }
|
||||
int nsub() { return nsub_; }
|
||||
bool simple() { return simple_; }
|
||||
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
||||
int Ref(); // For testing.
|
||||
|
||||
Regexp** sub() {
|
||||
if(nsub_ <= 1)
|
||||
return &subone_;
|
||||
else
|
||||
return submany_;
|
||||
}
|
||||
|
||||
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
|
||||
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
|
||||
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
|
||||
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
|
||||
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
|
||||
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
|
||||
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
|
||||
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
|
||||
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
|
||||
|
||||
// Increments reference count, returns object as convenience.
|
||||
Regexp* Incref();
|
||||
|
||||
// Decrements reference count and deletes this object if count reaches 0.
|
||||
void Decref();
|
||||
|
||||
// Parses string s to produce regular expression, returned.
|
||||
// Caller must release return value with re->Decref().
|
||||
// On failure, sets *status (if status != NULL) and returns NULL.
|
||||
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
|
||||
RegexpStatus* status);
|
||||
|
||||
// Returns a _new_ simplified version of the current regexp.
|
||||
// Does not edit the current regexp.
|
||||
// Caller must release return value with re->Decref().
|
||||
// Simplified means that counted repetition has been rewritten
|
||||
// into simpler terms and all Perl/POSIX features have been
|
||||
// removed. The result will capture exactly the same
|
||||
// subexpressions the original did, unless formatted with ToString.
|
||||
Regexp* Simplify();
|
||||
friend class SimplifyWalker;
|
||||
|
||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
||||
// string representation of the simplified form. Returns true on success.
|
||||
// Returns false and sets *status (if status != NULL) on parse error.
|
||||
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
||||
string* dst,
|
||||
RegexpStatus* status);
|
||||
|
||||
// Returns the number of capturing groups in the regexp.
|
||||
int NumCaptures();
|
||||
friend class NumCapturesWalker;
|
||||
|
||||
// Returns a map from names to capturing group indices,
|
||||
// or NULL if the regexp contains no named capture groups.
|
||||
// The caller is responsible for deleting the map.
|
||||
map<string, int>* NamedCaptures();
|
||||
|
||||
// Returns a map from capturing group indices to capturing group
|
||||
// names or NULL if the regexp contains no named capture groups. The
|
||||
// caller is responsible for deleting the map.
|
||||
map<int, string>* CaptureNames();
|
||||
|
||||
// Returns a string representation of the current regexp,
|
||||
// using as few parentheses as possible.
|
||||
string ToString();
|
||||
|
||||
// Convenience functions. They consume the passed reference,
|
||||
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
|
||||
// They do not consume allocated arrays like subs or runes.
|
||||
static Regexp* Plus(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Star(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Quest(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
|
||||
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
|
||||
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
|
||||
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
|
||||
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
|
||||
static Regexp* HaveMatch(int match_id, ParseFlags flags);
|
||||
|
||||
// Like Alternate but does not factor out common prefixes.
|
||||
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
|
||||
// Debugging function. Returns string format for regexp
|
||||
// that makes structure clear. Does NOT use regexp syntax.
|
||||
string Dump();
|
||||
|
||||
// Helper traversal class, defined fully in walker-inl.h.
|
||||
template<typename T> class Walker;
|
||||
|
||||
// Compile to Prog. See prog.h
|
||||
// Reverse prog expects to be run over text backward.
|
||||
// Construction and execution of prog will
|
||||
// stay within approximately max_mem bytes of memory.
|
||||
// If max_mem <= 0, a reasonable default is used.
|
||||
Prog* CompileToProg(int64 max_mem);
|
||||
Prog* CompileToReverseProg(int64 max_mem);
|
||||
|
||||
// Whether to expect this library to find exactly the same answer as PCRE
|
||||
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
|
||||
// obscure cases behave differently. Technically this is more a property
|
||||
// of the Prog than the Regexp, but the computation is much easier to do
|
||||
// on the Regexp. See mimics_pcre.cc for the exact conditions.
|
||||
bool MimicsPCRE();
|
||||
|
||||
// Benchmarking function.
|
||||
void NullWalk();
|
||||
|
||||
// Whether every match of this regexp must be anchored and
|
||||
// begin with a non-empty fixed string (perhaps after ASCII
|
||||
// case-folding). If so, returns the prefix and the sub-regexp that
|
||||
// follows it.
|
||||
bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
|
||||
|
||||
private:
|
||||
// Constructor allocates vectors as appropriate for operator.
|
||||
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
|
||||
|
||||
// Use Decref() instead of delete to release Regexps.
|
||||
// This is private to catch deletes at compile time.
|
||||
~Regexp();
|
||||
void Destroy();
|
||||
bool QuickDestroy();
|
||||
|
||||
// Helpers for Parse. Listed here so they can edit Regexps.
|
||||
class ParseState;
|
||||
friend class ParseState;
|
||||
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
|
||||
RegexpStatus* status);
|
||||
|
||||
// Helper for testing [sic].
|
||||
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
|
||||
|
||||
// Computes whether Regexp is already simple.
|
||||
bool ComputeSimple();
|
||||
|
||||
// Constructor that generates a concatenation or alternation,
|
||||
// enforcing the limit on the number of subexpressions for
|
||||
// a particular Regexp.
|
||||
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
|
||||
ParseFlags flags, bool can_factor);
|
||||
|
||||
// Returns the leading string that re starts with.
|
||||
// The returned Rune* points into a piece of re,
|
||||
// so it must not be used after the caller calls re->Decref().
|
||||
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
|
||||
|
||||
// Removes the first n leading runes from the beginning of re.
|
||||
// Edits re in place.
|
||||
static void RemoveLeadingString(Regexp* re, int n);
|
||||
|
||||
// Returns the leading regexp in re's top-level concatenation.
|
||||
// The returned Regexp* points at re or a sub-expression of re,
|
||||
// so it must not be used after the caller calls re->Decref().
|
||||
static Regexp* LeadingRegexp(Regexp* re);
|
||||
|
||||
// Removes LeadingRegexp(re) from re and returns the remainder.
|
||||
// Might edit re in place.
|
||||
static Regexp* RemoveLeadingRegexp(Regexp* re);
|
||||
|
||||
// Simplifies an alternation of literal strings by factoring out
|
||||
// common prefixes.
|
||||
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
|
||||
static int FactorAlternationRecursive(Regexp** sub, int nsub,
|
||||
ParseFlags flags, int maxdepth);
|
||||
|
||||
// Is a == b? Only efficient on regexps that have not been through
|
||||
// Simplify yet - the expansion of a kRegexpRepeat will make this
|
||||
// take a long time. Do not call on such regexps, hence private.
|
||||
static bool Equal(Regexp* a, Regexp* b);
|
||||
|
||||
// Allocate space for n sub-regexps.
|
||||
void AllocSub(int n) {
|
||||
if (n < 0 || static_cast<uint16>(n) != n)
|
||||
LOG(FATAL) << "Cannot AllocSub " << n;
|
||||
if (n > 1)
|
||||
submany_ = new Regexp*[n];
|
||||
nsub_ = n;
|
||||
}
|
||||
|
||||
// Add Rune to LiteralString
|
||||
void AddRuneToString(Rune r);
|
||||
|
||||
// Swaps this with that, in place.
|
||||
void Swap(Regexp *that);
|
||||
|
||||
// Operator. See description of operators above.
|
||||
// uint8 instead of RegexpOp to control space usage.
|
||||
uint8 op_;
|
||||
|
||||
// Is this regexp structure already simple
|
||||
// (has it been returned by Simplify)?
|
||||
// uint8 instead of bool to control space usage.
|
||||
uint8 simple_;
|
||||
|
||||
// Flags saved from parsing and used during execution.
|
||||
// (Only FoldCase is used.)
|
||||
// uint16 instead of ParseFlags to control space usage.
|
||||
uint16 parse_flags_;
|
||||
|
||||
// Reference count. Exists so that SimplifyRegexp can build
|
||||
// regexp structures that are dags rather than trees to avoid
|
||||
// exponential blowup in space requirements.
|
||||
// uint16 to control space usage.
|
||||
// The standard regexp routines will never generate a
|
||||
// ref greater than the maximum repeat count (100),
|
||||
// but even so, Incref and Decref consult an overflow map
|
||||
// when ref_ reaches kMaxRef.
|
||||
uint16 ref_;
|
||||
static const uint16 kMaxRef = 0xffff;
|
||||
|
||||
// Subexpressions.
|
||||
// uint16 to control space usage.
|
||||
// Concat and Alternate handle larger numbers of subexpressions
|
||||
// by building concatenation or alternation trees.
|
||||
// Other routines should call Concat or Alternate instead of
|
||||
// filling in sub() by hand.
|
||||
uint16 nsub_;
|
||||
static const uint16 kMaxNsub = 0xffff;
|
||||
union {
|
||||
Regexp** submany_; // if nsub_ > 1
|
||||
Regexp* subone_; // if nsub_ == 1
|
||||
};
|
||||
|
||||
// Extra space for parse and teardown stacks.
|
||||
Regexp* down_;
|
||||
|
||||
// Arguments to operator. See description of operators above.
|
||||
union {
|
||||
struct { // Repeat
|
||||
int max_;
|
||||
int min_;
|
||||
};
|
||||
struct { // Capture
|
||||
int cap_;
|
||||
string* name_;
|
||||
};
|
||||
struct { // LiteralString
|
||||
int nrunes_;
|
||||
Rune* runes_;
|
||||
};
|
||||
struct { // CharClass
|
||||
// These two could be in separate union members,
|
||||
// but it wouldn't save any space (there are other two-word structs)
|
||||
// and keeping them separate avoids confusion during parsing.
|
||||
CharClass* cc_;
|
||||
CharClassBuilder* ccb_;
|
||||
};
|
||||
Rune rune_; // Literal
|
||||
int match_id_; // HaveMatch
|
||||
void *the_union_[2]; // as big as any other element, for memset
|
||||
};
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
|
||||
};
|
||||
|
||||
// Character class set: contains non-overlapping, non-abutting RuneRanges.
|
||||
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
|
||||
|
||||
class CharClassBuilder {
|
||||
public:
|
||||
CharClassBuilder();
|
||||
|
||||
typedef RuneRangeSet::iterator iterator;
|
||||
iterator begin() { return ranges_.begin(); }
|
||||
iterator end() { return ranges_.end(); }
|
||||
|
||||
int size() { return nrunes_; }
|
||||
bool empty() { return nrunes_ == 0; }
|
||||
bool full() { return nrunes_ == Runemax+1; }
|
||||
|
||||
bool Contains(Rune r);
|
||||
bool FoldsASCII();
|
||||
bool AddRange(Rune lo, Rune hi); // returns whether class changed
|
||||
CharClassBuilder* Copy();
|
||||
void AddCharClass(CharClassBuilder* cc);
|
||||
void Negate();
|
||||
void RemoveAbove(Rune r);
|
||||
CharClass* GetCharClass();
|
||||
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
|
||||
|
||||
private:
|
||||
static const uint32 AlphaMask = (1<<26) - 1;
|
||||
uint32 upper_; // bitmap of A-Z
|
||||
uint32 lower_; // bitmap of a-z
|
||||
int nrunes_;
|
||||
RuneRangeSet ranges_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
|
||||
};
|
||||
|
||||
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
|
||||
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
||||
{
|
||||
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
|
||||
}
|
||||
|
||||
// Bitwise XOR on ParseFlags that yields ParseFlags rather than int.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  const int bits = static_cast<int>(a) ^ static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
|
||||
|
||||
// Bitwise AND on ParseFlags that yields ParseFlags rather than int.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  const int bits = static_cast<int>(a) & static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
|
||||
|
||||
// Bitwise complement on ParseFlags that yields ParseFlags rather than int.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
{
  const int inverted = ~static_cast<int>(a);
  return static_cast<Regexp::ParseFlags>(inverted);
}
|
||||
|
||||
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_REGEXP_H__
|
|
@ -0,0 +1,113 @@
|
|||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/set.h"
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
using namespace re2;
|
||||
|
||||
// Creates an empty, not-yet-compiled set that keeps its own copy of
// options and remembers the requested anchoring.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
  compiled_ = false;
  prog_ = NULL;
  anchor_ = anchor;
  options_.Copy(options);
}
|
||||
|
||||
// Drops the reference held on each regexp still stored in re_ and
// deletes the compiled program.
RE2::Set::~Set() {
  const size_t count = re_.size();
  for (size_t k = 0; k < count; k++) {
    re_[k]->Decref();
  }
  delete prog_;
}
|
||||
|
||||
int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
||||
if (compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Add after Compile";
|
||||
return -1;
|
||||
}
|
||||
|
||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||
options_.ParseFlags());
|
||||
|
||||
RegexpStatus status;
|
||||
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
|
||||
if (re == NULL) {
|
||||
if (error != NULL)
|
||||
*error = status.Text();
|
||||
if (options_.log_errors())
|
||||
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Concatenate with match index and push on vector.
|
||||
int n = re_.size();
|
||||
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
|
||||
if (re->op() == kRegexpConcat) {
|
||||
int nsub = re->nsub();
|
||||
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
|
||||
for (int i = 0; i < nsub; i++)
|
||||
sub[i] = re->sub()[i]->Incref();
|
||||
sub[nsub] = m;
|
||||
re->Decref();
|
||||
re = re2::Regexp::Concat(sub, nsub + 1, pf);
|
||||
delete[] sub;
|
||||
} else {
|
||||
re2::Regexp* sub[2];
|
||||
sub[0] = re;
|
||||
sub[1] = m;
|
||||
re = re2::Regexp::Concat(sub, 2, pf);
|
||||
}
|
||||
re_.push_back(re);
|
||||
return n;
|
||||
}
|
||||
|
||||
bool RE2::Set::Compile() {
|
||||
if (compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Compile multiple times";
|
||||
return false;
|
||||
}
|
||||
compiled_ = true;
|
||||
|
||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||
options_.ParseFlags());
|
||||
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
|
||||
re_.size(), pf);
|
||||
re_.clear();
|
||||
re2::Regexp* sre = re->Simplify();
|
||||
re->Decref();
|
||||
re = sre;
|
||||
if (re == NULL) {
|
||||
if (options_.log_errors())
|
||||
LOG(ERROR) << "Error simplifying during Compile.";
|
||||
return false;
|
||||
}
|
||||
|
||||
prog_ = Prog::CompileSet(options_, anchor_, re);
|
||||
return prog_ != NULL;
|
||||
}
|
||||
|
||||
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
|
||||
if (!compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Match without Compile";
|
||||
return false;
|
||||
}
|
||||
v->clear();
|
||||
bool failed;
|
||||
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
|
||||
Prog::kManyMatch, NULL, &failed, v);
|
||||
if (failed)
|
||||
LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
|
||||
|
||||
if (ret == false)
|
||||
return false;
|
||||
if (v->size() == 0) {
|
||||
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_SET_H
|
||||
#define RE2_SET_H
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
using std::vector;
|
||||
|
||||
// An RE2::Set represents a collection of regexps that can
|
||||
// be searched for simultaneously.
|
||||
class RE2::Set {
 public:
  // Creates an empty set whose patterns will be parsed with options and
  // matched with the given anchoring.
  Set(const RE2::Options& options, RE2::Anchor anchor);
  ~Set();

  // Adds one regexp pattern to the set, parsed using the set's options.
  // Returns the index that identifies the pattern in the output of
  // Match, or -1 if the pattern cannot be parsed.  Indices start at 0
  // and grow by one per successful Add; a failed Add consumes no index.
  // When parsing fails and error != NULL, *error receives the message.
  int Add(const StringPiece& pattern, string* error);

  // Prepares the set for matching.
  // Add must not be called after Compile, and Compile must be called
  // before Match.  Compile may return false if it runs out of memory.
  bool Compile();

  // Returns true if text matches at least one regexp in the set, in
  // which case v is filled with the indices of the matching regexps.
  bool Match(const StringPiece& text, vector<int>* v) const;

 private:
  RE2::Options options_;
  RE2::Anchor anchor_;
  vector<re2::Regexp*> re_;
  re2::Prog* prog_;
  bool compiled_;

  // Copying is disallowed: declared here but never defined.
  Set(const Set&);
  void operator=(const Set&);
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_SET_H
|
|
@ -0,0 +1,393 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Rewrite POSIX and other features in re
|
||||
// to use simple extended regular expression features.
|
||||
// Also sort and simplify character classes.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
||||
// string representation of the simplified form. Returns true on success.
|
||||
// Returns false and sets *error (if error != NULL) on error.
|
||||
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
||||
string* dst,
|
||||
RegexpStatus* status) {
|
||||
Regexp* re = Parse(src, flags, status);
|
||||
if (re == NULL)
|
||||
return false;
|
||||
Regexp* sre = re->Simplify();
|
||||
re->Decref();
|
||||
if (sre == NULL) {
|
||||
// Should not happen, since Simplify never fails.
|
||||
LOG(ERROR) << "Simplify failed on " << src;
|
||||
if (status) {
|
||||
status->set_code(kRegexpInternalError);
|
||||
status->set_error_arg(src);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
*dst = sre->ToString();
|
||||
sre->Decref();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Assuming the simple_ flags on the children are accurate,
|
||||
// is this Regexp* simple?
|
||||
bool Regexp::ComputeSimple() {
|
||||
Regexp** subs;
|
||||
switch (op_) {
|
||||
case kRegexpNoMatch:
|
||||
case kRegexpEmptyMatch:
|
||||
case kRegexpLiteral:
|
||||
case kRegexpLiteralString:
|
||||
case kRegexpBeginLine:
|
||||
case kRegexpEndLine:
|
||||
case kRegexpBeginText:
|
||||
case kRegexpWordBoundary:
|
||||
case kRegexpNoWordBoundary:
|
||||
case kRegexpEndText:
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
case kRegexpHaveMatch:
|
||||
return true;
|
||||
case kRegexpConcat:
|
||||
case kRegexpAlternate:
|
||||
// These are simple as long as the subpieces are simple.
|
||||
subs = sub();
|
||||
for (int i = 0; i < nsub_; i++)
|
||||
if (!subs[i]->simple_)
|
||||
return false;
|
||||
return true;
|
||||
case kRegexpCharClass:
|
||||
// Simple as long as the char class is not empty, not full.
|
||||
if (ccb_ != NULL)
|
||||
return !ccb_->empty() && !ccb_->full();
|
||||
return !cc_->empty() && !cc_->full();
|
||||
case kRegexpCapture:
|
||||
subs = sub();
|
||||
return subs[0]->simple_;
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
subs = sub();
|
||||
if (!subs[0]->simple_)
|
||||
return false;
|
||||
switch (subs[0]->op_) {
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
case kRegexpEmptyMatch:
|
||||
case kRegexpNoMatch:
|
||||
return false;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
case kRegexpRepeat:
|
||||
return false;
|
||||
}
|
||||
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Walker subclass used by Simplify.
|
||||
// The simplify walk is purely post-recursive: given the simplified children,
|
||||
// PostVisit creates the simplified result.
|
||||
// The child_args are simplified Regexp*s.
|
||||
class SimplifyWalker : public Regexp::Walker<Regexp*> {
 public:
  SimplifyWalker() {}

  // Walker hooks; see the definitions below for what each one does.
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
  virtual Regexp* PostVisit(Regexp* re,
                            Regexp* parent_arg,
                            Regexp* pre_arg,
                            Regexp** child_args, int nchild_args);
  virtual Regexp* Copy(Regexp* re);
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

 private:
  // The helpers below live inside SimplifyWalker so that they can write
  // the private fields of the Regexps they build.

  // Builds the concatenation re1 re2, taking over the caller's
  // references to both.  The caller must Decref the result.
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

  // Rewrites re{min,max} using only *, + and ?.
  // re is neither modified nor has its reference consumed.
  // The caller must Decref the result.
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
                                Regexp::ParseFlags parse_flags);

  // Simplifies a character class by expanding any named classes into
  // rune ranges.  re is neither modified nor has its reference consumed.
  // The caller must Decref the result.
  static Regexp* SimplifyCharClass(Regexp* re);

  DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
};
|
||||
|
||||
// Simplifies a regular expression, returning a new regexp.
|
||||
// The new regexp uses traditional Unix egrep features only,
|
||||
// plus the Perl (?:) non-capturing parentheses.
|
||||
// Otherwise, no POSIX or Perl additions. The new regexp
|
||||
// captures exactly the same subexpressions (with the same indices)
|
||||
// as the original.
|
||||
// Does not edit current object.
|
||||
// Caller must Decref() return value when done with it.
|
||||
|
||||
// Returns a simplified form of this regexp.  A regexp already flagged
// simple_ is returned as-is with an extra reference; otherwise the
// result comes from a SimplifyWalker walk.
Regexp* Regexp::Simplify() {
  if (!simple_) {
    SimplifyWalker walker;
    return walker.Walk(this, NULL);
  }
  return Incref();
}
|
||||
|
||||
#define Simplify DontCallSimplify // Avoid accidental recursion
|
||||
|
||||
// "Copies" a walk result by taking another reference on the same node.
Regexp* SimplifyWalker::Copy(Regexp* re) {
  Regexp* shared = re->Incref();
  return shared;
}
|
||||
|
||||
// Not expected to run: Simplify uses Walk, not WalkExponential.
// If it is called anyway, it logs and returns re with an extra reference.
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
  Regexp* unchanged = re->Incref();
  return unchanged;
}
|
||||
|
||||
// If re is already simple, sets *stop and returns re with an extra
// reference; otherwise returns NULL and leaves *stop untouched.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple_)
    return NULL;

  *stop = true;
  return re->Incref();
}
|
||||
|
||||
// Builds the simplified form of re from its already-simplified children
// (child_args).  Whenever the children are unchanged, the original node
// is reused (flagged simple_ and returned with an extra reference) and
// the references held in child_args are released.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  switch (re->op()) {
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpLiteralString:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpEndText:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpHaveMatch:
      // Leaves are simple by definition.
      re->simple_ = true;
      return re->Incref();

    case kRegexpConcat:
    case kRegexpAlternate: {
      // Scan for the first child that differs from the original, so
      // that nothing is allocated in the common unchanged case.
      Regexp** old_subs = re->sub();
      int same = 0;
      while (same < re->nsub_ && child_args[same] == old_subs[same])
        same++;

      if (same == re->nsub_) {
        for (int i = 0; i < re->nsub_; i++)
          child_args[i]->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // Some child changed: build a new node that adopts the
      // references held in child_args.
      Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
      rebuilt->AllocSub(re->nsub_);
      Regexp** new_subs = rebuilt->sub();
      for (int i = 0; i < re->nsub_; i++)
        new_subs[i] = child_args[i];
      rebuilt->simple_ = true;
      return rebuilt;
    }

    case kRegexpCapture: {
      Regexp* child = child_args[0];
      if (child != re->sub()[0]) {
        // New child: clone the capture node, keeping its capture index.
        Regexp* clone = new Regexp(kRegexpCapture, re->parse_flags());
        clone->AllocSub(1);
        clone->sub()[0] = child;
        clone->cap_ = re->cap_;
        clone->simple_ = true;
        return clone;
      }
      child->Decref();
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest: {
      Regexp* child = child_args[0];

      // Repeating the empty string any number of times is still the
      // empty string.
      if (child->op() == kRegexpEmptyMatch)
        return child;

      // Unchanged child: reuse the original node.
      if (child == re->sub()[0]) {
        child->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // Applying the same operator twice with the same flags adds
      // nothing, so the child alone is the answer.
      const bool idempotent = re->op() == child->op() &&
                              re->parse_flags() == child->parse_flags();
      if (idempotent)
        return child;

      Regexp* wrapped = new Regexp(re->op(), re->parse_flags());
      wrapped->AllocSub(1);
      wrapped->sub()[0] = child;
      wrapped->simple_ = true;
      return wrapped;
    }

    case kRegexpRepeat: {
      Regexp* child = child_args[0];

      // Repeating the empty string any number of times is still the
      // empty string.
      if (child->op() == kRegexpEmptyMatch)
        return child;

      // SimplifyRepeat does not consume child, so release it afterwards.
      Regexp* expanded = SimplifyRepeat(child, re->min_, re->max_,
                                        re->parse_flags());
      child->Decref();
      expanded->simple_ = true;
      return expanded;
    }

    case kRegexpCharClass: {
      Regexp* reduced = SimplifyCharClass(re);
      reduced->simple_ = true;
      return reduced;
    }
  }

  LOG(ERROR) << "Simplify case not handled: " << re->op();
  return re->Incref();
}
|
||||
|
||||
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
||||
// Returns a new Regexp, handing the ref to the caller.
|
||||
// Allocates a two-element concatenation node holding re1 then re2.
// The new node takes over the caller's references to both operands;
// the caller owns the reference to the returned node.
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
                                Regexp::ParseFlags parse_flags) {
  Regexp* pair = new Regexp(kRegexpConcat, parse_flags);
  pair->AllocSub(2);
  pair->sub()[0] = re1;
  pair->sub()[1] = re2;
  return pair;
}
|
||||
|
||||
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
||||
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
||||
// Caller must Decref return value when done with it.
|
||||
// The result will *not* necessarily have the right capturing parens
|
||||
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
|
||||
// but in the Regexp* representation, both (x) are marked as $1.
|
||||
// Expands re{min,max} into an equivalent built from *, + and ?.
// max == -1 means "no upper bound".  re itself is not modified and the
// caller keeps its reference; every use of re in the result takes its
// own reference.  The caller must Decref the returned regexp.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
                                       Regexp::ParseFlags f) {
  // Unbounded repetition: x{n,}.
  if (max == -1) {
    switch (min) {
      case 0:  // x{0,} is x*
        return Regexp::Star(re->Incref(), f);
      case 1:  // x{1,} is x+
        return Regexp::Plus(re->Incref(), f);
      default:
        break;
    }

    // General case: x{4,} is xxxx+ (min-1 plain copies, then x+).
    Regexp* seq = new Regexp(kRegexpConcat, f);
    seq->AllocSub(min);
    VLOG(1) << "Simplify " << min;
    Regexp** parts = seq->sub();
    const int last = min - 1;
    for (int i = 0; i < last; i++)
      parts[i] = re->Incref();
    parts[last] = Regexp::Plus(re->Incref(), f);
    return seq;
  }

  // (x){0} matches only the empty string.
  if (min == 0 && max == 0)
    return new Regexp(kRegexpEmptyMatch, f);

  // x{1} is just x.
  if (min == 1 && max == 1)
    return re->Incref();

  // General case: x{n,m} is n copies of x followed by (m-n) nested
  // optional copies, e.g. x{2,5} = xx(x(x(x)?)?)?

  // Mandatory prefix: min copies of x.
  Regexp* result = NULL;
  if (min > 0) {
    result = new Regexp(kRegexpConcat, f);
    result->AllocSub(min);
    Regexp** parts = result->sub();
    for (int i = 0; i < min; i++)
      parts[i] = re->Incref();
  }

  // Optional suffix: (x(x(x)?)?)? built from the innermost x? outwards.
  if (max > min) {
    Regexp* tail = Regexp::Quest(re->Incref(), f);
    for (int depth = min + 1; depth < max; depth++)
      tail = Regexp::Quest(Concat2(re->Incref(), tail, f), f);
    result = (result == NULL) ? tail : Concat2(result, tail, f);
  }

  if (result != NULL)
    return result;

  // Degenerate bounds such as min > max or min < max < 0.  The parser
  // is expected to reject these, so this is a should-not-happen path.
  LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
  return new Regexp(kRegexpNoMatch, f);
}
|
||||
|
||||
// Simplifies a character class.
|
||||
// Caller must Decref return value when done with it.
|
||||
// Replaces an empty character class with a no-match node and a full
// one with an any-char node (both carrying re's parse flags).  Any
// other class is returned unchanged with an extra reference.
// The caller must Decref the result.
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
  CharClass* klass = re->cc();

  const bool matches_nothing = klass->empty();
  if (matches_nothing)
    return new Regexp(kRegexpNoMatch, re->parse_flags());

  const bool matches_everything = klass->full();
  if (matches_everything)
    return new Regexp(kRegexpAnyChar, re->parse_flags());

  return re->Incref();
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,182 @@
|
|||
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// A string-like object that points to a sized piece of memory.
|
||||
//
|
||||
// Functions or methods may use const StringPiece& parameters to accept either
|
||||
// a "const char*" or a "string" value that will be implicitly converted to
|
||||
// a StringPiece. The implicit conversion means that it is often appropriate
|
||||
// to include this .h file in other files rather than forward-declaring
|
||||
// StringPiece as would be appropriate for most other Google classes.
|
||||
//
|
||||
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
|
||||
// conversions from "const char*" to "string" and back again.
|
||||
//
|
||||
//
|
||||
// Arghh! I wish C++ literals were "string".
|
||||
|
||||
#ifndef STRINGS_STRINGPIECE_H__
|
||||
#define STRINGS_STRINGPIECE_H__
|
||||
|
||||
#include <string.h>
#include <algorithm>
#include <cstddef>
#include <iosfwd>
#include <iterator>
#include <string>
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// A non-owning view of a sized run of bytes: a pointer plus an int
// length.  The viewed memory must outlive the StringPiece.
class StringPiece {
 private:
  const char* ptr_;
  int length_;

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  int size() const { return length_; }
  int length() const { return length_; }
  bool empty() const { return length_ == 0; }

  void clear() { ptr_ = NULL; length_ = 0; }
  void set(const char* data, int len) { ptr_ = data; length_ = len; }
  void set(const char* str) {
    ptr_ = str;
    if (str != NULL)
      length_ = static_cast<int>(strlen(str));
    else
      length_ = 0;
  }
  void set(const void* data, int len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // No bounds checking: i must lie in [0, size()).
  char operator[](int i) const { return ptr_[i]; }

  // Drops the first n bytes.  No bounds checking is performed.
  void remove_prefix(int n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes.  No bounds checking is performed.
  void remove_suffix(int n) {
    length_ -= n;
  }

  // Three-way bytewise comparison: negative, zero or positive as *this
  // sorts before, equal to or after x.  A shorter piece that is a
  // prefix of the longer one sorts first.
  int compare(const StringPiece& x) const {
    const int common = std::min(length_, x.length_);
    // memcmp requires valid pointers even for a zero count, and an
    // empty StringPiece may hold NULL, so skip the call in that case.
    int r = (common > 0) ? memcmp(ptr_, x.ptr_, common) : 0;
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  std::string as_string() const {
    return std::string(data(), size());
  }
  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()".  We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    return std::string(data(), size());
  }

  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    // An empty x is a prefix of everything; answering that directly
    // avoids handing memcmp a possibly-NULL pointer.
    return ((length_ >= x.length_) &&
            (x.length_ == 0 ||
             memcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    // Same zero-length guard as starts_with.
    return ((length_ >= x.length_) &&
            (x.length_ == 0 ||
             memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }
  // STLS says return size_type, but Google says return int
  int max_size() const { return length_; }
  int capacity() const { return length_; }

  int copy(char* buf, size_type n, size_type pos = 0) const;

  int find(const StringPiece& s, size_type pos = 0) const;
  int find(char c, size_type pos = 0) const;
  int rfind(const StringPiece& s, size_type pos = npos) const;
  int rfind(char c, size_type pos = npos) const;

  StringPiece substr(size_type pos, size_type n = npos) const;

  static bool _equal(const StringPiece&, const StringPiece&);
};
|
||||
|
||||
// Equality is delegated to StringPiece::_equal.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
  const bool same = StringPiece::_equal(x, y);
  return same;
}
|
||||
|
||||
// Inequality is the negation of operator==.
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
  if (x == y)
    return false;
  return true;
}
|
||||
|
||||
// Bytewise ordering: x < y when x sorts first over the common prefix
// length, or when x is a strict prefix of y.
inline bool operator<(const StringPiece& x, const StringPiece& y) {
  const int common = std::min(x.size(), y.size());
  // memcmp requires valid pointers even for a zero count, and an empty
  // StringPiece may hold a NULL data pointer, so skip the call then.
  const int r = (common > 0) ? memcmp(x.data(), y.data(), common) : 0;
  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
}
|
||||
|
||||
// x > y exactly when y < x.
inline bool operator>(const StringPiece& x, const StringPiece& y) {
  const bool y_sorts_first = (y < x);
  return y_sorts_first;
}
|
||||
|
||||
// x <= y exactly when x is not greater than y.
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
  if (x > y)
    return false;
  return true;
}
|
||||
|
||||
// x >= y exactly when x is not less than y.
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
  if (x < y)
    return false;
  return true;
}
|
||||
|
||||
} // namespace re2
|
||||
|
||||
// allow StringPiece to be logged
|
||||
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
|
||||
|
||||
#endif // STRINGS_STRINGPIECE_H__
|
|
@ -0,0 +1,254 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
|
||||
//
|
||||
// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
|
||||
// except that it remembers where it has been, trading a lot of
|
||||
// memory for a lot of time. It exists only for testing purposes.
|
||||
//
|
||||
// Let me repeat that.
|
||||
//
|
||||
// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
|
||||
// - It uses a ton of memory.
|
||||
// - It uses a ton of stack.
|
||||
// - It uses CHECK and LOG(FATAL).
|
||||
// - It implements unanchored search by repeated anchored search.
|
||||
//
|
||||
// On the other hand, it is very simple and a good reference
|
||||
// implementation for the more complicated regexp packages.
|
||||
//
|
||||
// In BUILD, this file is linked into the ":testing" library,
|
||||
// not the main library, in order to make it harder to pick up
|
||||
// accidentally.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Backtracker holds the state for a backtracking search.
|
||||
//
|
||||
// Excluding the search parameters, the main search state
|
||||
// is just the "capture registers", which record, for the
|
||||
// current execution, the string position at which each
|
||||
// parenthesis was passed. cap_[0] and cap_[1] are the
|
||||
// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
|
||||
//
|
||||
// To avoid infinite loops during backtracking on expressions
|
||||
// like (a*)*, the visited_[] bitmap marks the (state, string-position)
|
||||
// pairs that have already been explored and are thus not worth
|
||||
// re-exploring if we get there via another path. Modern backtracking
|
||||
// libraries engineer their program representation differently, to make
|
||||
// such infinite loops possible to avoid without keeping a giant visited_
|
||||
// bitmap, but visited_ works fine for a reference implementation
|
||||
// and it has the nice benefit of making the search run in linear time.
|
||||
class Backtracker {
|
||||
public:
|
||||
explicit Backtracker(Prog* prog);
|
||||
~Backtracker();
|
||||
|
||||
bool Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch);
|
||||
|
||||
private:
|
||||
// Explores from instruction ip at string position p looking for a match.
|
||||
// Returns true if found (so that caller can stop trying other possibilities).
|
||||
bool Visit(int id, const char* p);
|
||||
|
||||
// Search parameters
|
||||
Prog* prog_; // program being run
|
||||
StringPiece text_; // text being searched
|
||||
StringPiece context_; // greater context of text being searched
|
||||
bool anchored_; // whether search is anchored at text.begin()
|
||||
bool longest_; // whether search wants leftmost-longest match
|
||||
bool endmatch_; // whether search must end at text.end()
|
||||
StringPiece *submatch_; // submatches to fill in
|
||||
int nsubmatch_; // # of submatches to fill in
|
||||
|
||||
// Search state
|
||||
const char* cap_[64]; // capture registers
|
||||
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
||||
int nvisited_; // # of words in bitmap
|
||||
};
|
||||
|
||||
// Remembers the program to run and zeroes the per-search parameters
// and the visited bitmap.  cap_ is not touched here; Search clears it.
Backtracker::Backtracker(Prog* prog)
    : prog_(prog),
      anchored_(false), longest_(false), endmatch_(false),
      submatch_(NULL), nsubmatch_(0),
      visited_(NULL), nvisited_(0) {
}
|
||||
|
||||
// Frees the visited_ bitmap allocated by Search (NULL if Search never ran).
Backtracker::~Backtracker() {
|
||||
delete[] visited_;
|
||||
}
|
||||
|
||||
// Runs a backtracking search.
|
||||
bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch) {
|
||||
text_ = text;
|
||||
context_ = context;
|
||||
if (context_.begin() == NULL)
|
||||
context_ = text;
|
||||
if (prog_->anchor_start() && text.begin() > context_.begin())
|
||||
return false;
|
||||
if (prog_->anchor_end() && text.end() < context_.end())
|
||||
return false;
|
||||
anchored_ = anchored | prog_->anchor_start();
|
||||
longest_ = longest | prog_->anchor_end();
|
||||
endmatch_ = prog_->anchor_end();
|
||||
submatch_ = submatch;
|
||||
nsubmatch_ = nsubmatch;
|
||||
CHECK(2*nsubmatch_ < arraysize(cap_));
|
||||
memset(cap_, 0, sizeof cap_);
|
||||
|
||||
// We use submatch_[0] for our own bookkeeping,
|
||||
// so it had better exist.
|
||||
StringPiece sp0;
|
||||
if (nsubmatch < 1) {
|
||||
submatch_ = &sp0;
|
||||
nsubmatch_ = 1;
|
||||
}
|
||||
submatch_[0] = NULL;
|
||||
|
||||
// Allocate new visited_ bitmap -- size is proportional
|
||||
// to text, so have to reallocate on each call to Search.
|
||||
delete[] visited_;
|
||||
nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
|
||||
visited_ = new uint32[nvisited_];
|
||||
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
||||
|
||||
// Anchored search must start at text.begin().
|
||||
if (anchored_) {
|
||||
cap_[0] = text.begin();
|
||||
return Visit(prog_->start(), text.begin());
|
||||
}
|
||||
|
||||
// Unanchored search, starting from each possible text position.
|
||||
// Notice that we have to try the empty string at the end of
|
||||
// the text, so the loop condition is p <= text.end(), not p < text.end().
|
||||
for (const char* p = text.begin(); p <= text.end(); p++) {
|
||||
cap_[0] = p;
|
||||
if (Visit(prog_->start(), p)) // Match must be leftmost; done.
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Explores from instruction ip at string position p looking for a match.
|
||||
// Return true if found (so that caller can stop trying other possibilities).
|
||||
bool Backtracker::Visit(int id, const char* p) {
|
||||
// Check bitmap. If we've already explored from here,
|
||||
// either it didn't match or it did but we're hoping for a better match.
|
||||
// Either way, don't go down that road again.
|
||||
CHECK(p <= text_.end());
|
||||
int n = id*(text_.size()+1) + (p - text_.begin());
|
||||
CHECK_LT(n/32, nvisited_);
|
||||
if (visited_[n/32] & (1 << (n&31)))
|
||||
return false;
|
||||
visited_[n/32] |= 1 << (n&31);
|
||||
|
||||
// Pick out byte at current position. If at end of string,
|
||||
// have to explore in hope of finishing a match. Use impossible byte -1.
|
||||
int c = -1;
|
||||
if (p < text_.end())
|
||||
c = *p & 0xFF;
|
||||
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
|
||||
return false; // not reached
|
||||
|
||||
case kInstAlt:
|
||||
case kInstAltMatch:
|
||||
// Try both possible next states: out is preferred to out1.
|
||||
if (Visit(ip->out(), p)) {
|
||||
if (longest_)
|
||||
Visit(ip->out1(), p);
|
||||
return true;
|
||||
}
|
||||
return Visit(ip->out1(), p);
|
||||
|
||||
case kInstByteRange:
|
||||
if (ip->Matches(c))
|
||||
return Visit(ip->out(), p+1);
|
||||
return false;
|
||||
|
||||
case kInstCapture:
|
||||
if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
|
||||
// Capture p to register, but save old value.
|
||||
const char* q = cap_[ip->cap()];
|
||||
cap_[ip->cap()] = p;
|
||||
bool ret = Visit(ip->out(), p);
|
||||
// Restore old value as we backtrack.
|
||||
cap_[ip->cap()] = q;
|
||||
return ret;
|
||||
}
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
||||
return false;
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstNop:
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstMatch:
|
||||
// We found a match. If it's the best so far, record the
|
||||
// parameters in the caller's submatch_ array.
|
||||
if (endmatch_ && p != context_.end())
|
||||
return false;
|
||||
cap_[1] = p;
|
||||
if (submatch_[0].data() == NULL || // First match so far ...
|
||||
(longest_ && p > submatch_[0].end())) { // ... or better match
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
|
||||
}
|
||||
return true;
|
||||
|
||||
case kInstFail:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Runs a backtracking search.
|
||||
bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
|
||||
const StringPiece& context,
|
||||
Anchor anchor,
|
||||
MatchKind kind,
|
||||
StringPiece* match,
|
||||
int nmatch) {
|
||||
// If full match, we ask for an anchored longest match
|
||||
// and then check that match[0] == text.
|
||||
// So make sure match[0] exists.
|
||||
StringPiece sp0;
|
||||
if (kind == kFullMatch) {
|
||||
anchor = kAnchored;
|
||||
if (nmatch < 1) {
|
||||
match = &sp0;
|
||||
nmatch = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the search.
|
||||
Backtracker b(this);
|
||||
bool anchored = anchor == kAnchored;
|
||||
bool longest = kind != kFirstMatch;
|
||||
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
||||
return false;
|
||||
if (kind == kFullMatch && match[0].end() != text.end())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,223 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test character class manipulations.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct CCTest {
|
||||
struct {
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
} add[10];
|
||||
int remove;
|
||||
struct {
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
} final[10];
|
||||
};
|
||||
|
||||
static CCTest tests[] = {
|
||||
{ { { 10, 20 }, {-1} }, -1,
|
||||
{ { 10, 20 }, {-1} } },
|
||||
|
||||
{ { { 10, 20 }, { 20, 30 }, {-1} }, -1,
|
||||
{ { 10, 30 }, {-1} } },
|
||||
|
||||
{ { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} }, -1,
|
||||
{ { 10, 40 }, {-1} } },
|
||||
|
||||
{ { { 0, 50 }, { 20, 30 }, {-1} }, -1,
|
||||
{ { 0, 50 }, {-1} } },
|
||||
|
||||
{ { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} }, -1,
|
||||
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
|
||||
|
||||
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
|
||||
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
|
||||
|
||||
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} }, -1,
|
||||
{ { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },
|
||||
|
||||
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} }, -1,
|
||||
{ { 5, 25 }, {-1} } },
|
||||
|
||||
{ { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} }, -1,
|
||||
{ { 10, 23 }, {-1} } },
|
||||
|
||||
// These check boundary cases during negation.
|
||||
{ { { 0, Runemax }, {-1} }, -1,
|
||||
{ { 0, Runemax }, {-1} } },
|
||||
|
||||
{ { { 0, 50 }, {-1} }, -1,
|
||||
{ { 0, 50 }, {-1} } },
|
||||
|
||||
{ { { 50, Runemax }, {-1} }, -1,
|
||||
{ { 50, Runemax }, {-1} } },
|
||||
|
||||
// Check RemoveAbove.
|
||||
{ { { 50, Runemax }, {-1} }, 255,
|
||||
{ { 50, 255 }, {-1} } },
|
||||
|
||||
{ { { 50, Runemax }, {-1} }, 65535,
|
||||
{ { 50, 65535 }, {-1} } },
|
||||
|
||||
{ { { 50, Runemax }, {-1} }, Runemax,
|
||||
{ { 50, Runemax }, {-1} } },
|
||||
|
||||
{ { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} }, 255,
|
||||
{ { 50, 60 }, { 250, 255 }, {-1} } },
|
||||
|
||||
{ { { 50, 60 }, {-1} }, 255,
|
||||
{ { 50, 60 }, {-1} } },
|
||||
|
||||
{ { { 350, 360 }, {-1} }, 255,
|
||||
{ {-1} } },
|
||||
|
||||
{ { {-1} }, 255,
|
||||
{ {-1} } },
|
||||
};
|
||||
|
||||
template<class CharClass>
|
||||
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
|
||||
if (t == NULL) {
|
||||
printf("\t%s:", desc);
|
||||
} else {
|
||||
printf("\n");
|
||||
printf("CharClass added: [%s]", desc);
|
||||
for (int k = 0; t->add[k].lo >= 0; k++)
|
||||
printf(" %d-%d", t->add[k].lo, t->add[k].hi);
|
||||
printf("\n");
|
||||
if (t->remove >= 0)
|
||||
printf("Removed > %d\n", t->remove);
|
||||
printf("\twant:");
|
||||
for (int k = 0; t->final[k].lo >= 0; k++)
|
||||
printf(" %d-%d", t->final[k].lo, t->final[k].hi);
|
||||
printf("\n");
|
||||
printf("\thave:");
|
||||
}
|
||||
|
||||
for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
|
||||
printf(" %d-%d", it->lo, it->hi);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Reports whether x lies inside one of the expected final ranges of t.
// The final list is terminated by an entry with lo < 0.
bool ShouldContain(CCTest *t, int x) {
  int idx = 0;
  while (t->final[idx].lo >= 0) {
    const bool at_or_above_lo = t->final[idx].lo <= x;
    const bool at_or_below_hi = x <= t->final[idx].hi;
    if (at_or_above_lo && at_or_below_hi)
      return true;
    idx++;
  }
  return false;
}
|
||||
|
||||
// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
|
||||
|
||||
CharClass* Negate(CharClass *cc) {
|
||||
return cc->Negate();
|
||||
}
|
||||
|
||||
void Delete(CharClass* cc) {
|
||||
cc->Delete();
|
||||
}
|
||||
|
||||
CharClassBuilder* Negate(CharClassBuilder* cc) {
|
||||
CharClassBuilder* ncc = cc->Copy();
|
||||
ncc->Negate();
|
||||
return ncc;
|
||||
}
|
||||
|
||||
void Delete(CharClassBuilder* cc) {
|
||||
delete cc;
|
||||
}
|
||||
|
||||
template<class CharClass>
|
||||
bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
|
||||
typename CharClass::iterator it = cc->begin();
|
||||
int size = 0;
|
||||
for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
|
||||
if (it == cc->end() ||
|
||||
it->lo != t->final[j].lo ||
|
||||
it->hi != t->final[j].hi) {
|
||||
Broke(desc, t, cc);
|
||||
return false;
|
||||
}
|
||||
size += it->hi - it->lo + 1;
|
||||
}
|
||||
if (it != cc->end()) {
|
||||
Broke(desc, t, cc);
|
||||
return false;
|
||||
}
|
||||
if (cc->size() != size) {
|
||||
Broke(desc, t, cc);
|
||||
printf("wrong size: want %d have %d\n", size, cc->size());
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int j = 0; j < 101; j++) {
|
||||
if (j == 100)
|
||||
j = Runemax;
|
||||
if (ShouldContain(t, j) != cc->Contains(j)) {
|
||||
Broke(desc, t, cc);
|
||||
printf("want contains(%d)=%d, got %d\n",
|
||||
j, ShouldContain(t, j), cc->Contains(j));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
CharClass* ncc = Negate(cc);
|
||||
for (int j = 0; j < 101; j++) {
|
||||
if (j == 100)
|
||||
j = Runemax;
|
||||
if (ShouldContain(t, j) == ncc->Contains(j)) {
|
||||
Broke(desc, t, cc);
|
||||
Broke("ncc", NULL, ncc);
|
||||
printf("want ncc contains(%d)!=%d, got %d\n",
|
||||
j, ShouldContain(t, j), ncc->Contains(j));
|
||||
Delete(ncc);
|
||||
return false;
|
||||
}
|
||||
if (ncc->size() != Runemax+1 - cc->size()) {
|
||||
Broke(desc, t, cc);
|
||||
Broke("ncc", NULL, ncc);
|
||||
printf("ncc size should be %d is %d\n",
|
||||
Runemax+1 - cc->size(), ncc->size());
|
||||
Delete(ncc);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
Delete(ncc);
|
||||
return true;
|
||||
}
|
||||
|
||||
// For every entry in tests: builds a CharClassBuilder from the add list,
// applies RemoveAbove when remove >= 0, and checks the builder, the
// CharClass made from it, a copy of the builder, and the CharClass made
// from that copy against the expected final ranges.
TEST(TestCharClassBuilder, Adds) {
  int nfail = 0;
  for (int i = 0; i < arraysize(tests); i++) {
    CharClassBuilder ccb;
    CCTest* t = &tests[i];
    for (int j = 0; t->add[j].lo >= 0; j++)
      ccb.AddRange(t->add[j].lo, t->add[j].hi);
    if (t->remove >= 0)
      ccb.RemoveAbove(t->remove);
    if (!CorrectCC(&ccb, t, "before copy (CharClassBuilder)"))
      nfail++;
    CharClass* cc = ccb.GetCharClass();
    if (!CorrectCC(cc, t, "before copy (CharClass)"))
      nfail++;
    cc->Delete();

    CharClassBuilder *ccb1 = ccb.Copy();
    if (!CorrectCC(ccb1, t, "after copy (CharClassBuilder)"))
      nfail++;
    // The original builder must be unaffected by having been copied.
    cc = ccb.GetCharClass();
    if (!CorrectCC(cc, t, "after copy (CharClass)"))
      nfail++;
    cc->Delete();
    // The copy itself must also produce the expected CharClass.
    cc = ccb1->GetCharClass();
    if (!CorrectCC(cc, t, "after copy (CharClass from copy)"))
      nfail++;
    cc->Delete();
    delete ccb1;
  }
  EXPECT_EQ(nfail, 0);
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,171 @@
|
|||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test prog.cc, compile.cc
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/prog.h"
|
||||
|
||||
DEFINE_string(show, "", "regular expression to compile and dump");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Simple input/output tests checking that
|
||||
// the regexp compiles to the expected code.
|
||||
// These are just to sanity check the basic implementation.
|
||||
// The real confidence tests happen by testing the NFA/DFA
|
||||
// that run the compiled code.
|
||||
|
||||
struct Test {
|
||||
const char* regexp;
|
||||
const char* code;
|
||||
};
|
||||
|
||||
static Test tests[] = {
|
||||
{ "a",
|
||||
"1. byte [61-61] -> 2\n"
|
||||
"2. match! 0\n" },
|
||||
{ "ab",
|
||||
"1. byte [61-61] -> 2\n"
|
||||
"2. byte [62-62] -> 3\n"
|
||||
"3. match! 0\n" },
|
||||
{ "a|c",
|
||||
"3. alt -> 1 | 2\n"
|
||||
"1. byte [61-61] -> 4\n"
|
||||
"2. byte [63-63] -> 4\n"
|
||||
"4. match! 0\n" },
|
||||
{ "a|b",
|
||||
"1. byte [61-62] -> 2\n"
|
||||
"2. match! 0\n" },
|
||||
{ "[ab]",
|
||||
"1. byte [61-62] -> 2\n"
|
||||
"2. match! 0\n" },
|
||||
{ "a+",
|
||||
"1. byte [61-61] -> 2\n"
|
||||
"2. alt -> 1 | 3\n"
|
||||
"3. match! 0\n" },
|
||||
{ "a+?",
|
||||
"1. byte [61-61] -> 2\n"
|
||||
"2. alt -> 3 | 1\n"
|
||||
"3. match! 0\n" },
|
||||
{ "a*",
|
||||
"2. alt -> 1 | 3\n"
|
||||
"1. byte [61-61] -> 2\n"
|
||||
"3. match! 0\n" },
|
||||
{ "a*?",
|
||||
"2. alt -> 3 | 1\n"
|
||||
"3. match! 0\n"
|
||||
"1. byte [61-61] -> 2\n" },
|
||||
{ "a?",
|
||||
"2. alt -> 1 | 3\n"
|
||||
"1. byte [61-61] -> 3\n"
|
||||
"3. match! 0\n" },
|
||||
{ "a??",
|
||||
"2. alt -> 3 | 1\n"
|
||||
"3. match! 0\n"
|
||||
"1. byte [61-61] -> 3\n" },
|
||||
{ "a{4}",
|
||||
"1. byte [61-61] -> 2\n"
|
||||
"2. byte [61-61] -> 3\n"
|
||||
"3. byte [61-61] -> 4\n"
|
||||
"4. byte [61-61] -> 5\n"
|
||||
"5. match! 0\n" },
|
||||
{ "(a)",
|
||||
"2. capture 2 -> 1\n"
|
||||
"1. byte [61-61] -> 3\n"
|
||||
"3. capture 3 -> 4\n"
|
||||
"4. match! 0\n" },
|
||||
{ "(?:a)",
|
||||
"1. byte [61-61] -> 2\n"
|
||||
"2. match! 0\n" },
|
||||
{ "",
|
||||
"2. match! 0\n" },
|
||||
{ ".",
|
||||
"3. alt -> 1 | 2\n"
|
||||
"1. byte [00-09] -> 4\n"
|
||||
"2. byte [0b-ff] -> 4\n"
|
||||
"4. match! 0\n" },
|
||||
{ "[^ab]",
|
||||
"5. alt -> 3 | 4\n"
|
||||
"3. alt -> 1 | 2\n"
|
||||
"4. byte [63-ff] -> 6\n"
|
||||
"1. byte [00-09] -> 6\n"
|
||||
"2. byte [0b-60] -> 6\n"
|
||||
"6. match! 0\n" },
|
||||
{ "[Aa]",
|
||||
"1. byte/i [61-61] -> 2\n"
|
||||
"2. match! 0\n" },
|
||||
};
|
||||
|
||||
// Parses and compiles every regexp in tests and compares the dump of the
// compiled program with the expected listing.  Failures are logged and
// counted; the count must be zero at the end.
TEST(TestRegexpCompileToProg, Simple) {
  int nbad = 0;
  for (int idx = 0; idx < arraysize(tests); idx++) {
    const re2::Test& tc = tests[idx];
    Regexp* parsed = Regexp::Parse(tc.regexp, Regexp::PerlX|Regexp::Latin1,
                                   NULL);
    if (parsed == NULL) {
      LOG(ERROR) << "Cannot parse: " << tc.regexp;
      nbad++;
      continue;
    }

    Prog* compiled = parsed->CompileToProg(0);
    if (compiled != NULL) {
      // Compiling with a limit of 1 is expected to yield no program.
      CHECK(parsed->CompileToProg(1) == NULL);
      const string listing = compiled->Dump();
      if (listing != tc.code) {
        LOG(ERROR) << "Incorrect compiled code for: " << tc.regexp;
        LOG(ERROR) << "Want:\n" << tc.code;
        LOG(ERROR) << "Got:\n" << listing;
        nbad++;
      }
      delete compiled;
    } else {
      LOG(ERROR) << "Cannot compile: " << tc.regexp;
      nbad++;
    }
    parsed->Decref();
  }
  EXPECT_EQ(nbad, 0);
}
|
||||
|
||||
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
//
// The ranges are listed in increasing order and together cover every
// byte value 0x00-0xFF exactly once; the position of a range in the
// array is the byte class the test expects for each byte in it.
static struct UTF8ByteRange {
  int lo;  // first byte of the range (inclusive)
  int hi;  // last byte of the range (inclusive)
} utf8ranges[] = {
  { 0x00, 0x09 },
  { 0x0A, 0x0A },  // '\n', the one byte dot does not match
  { 0x0B, 0x7F },  // starts right after '\n' so 0x0B-0x0F are covered too
  { 0x80, 0x8F },
  { 0x90, 0x9F },
  { 0xA0, 0xBF },
  { 0xC0, 0xC1 },
  { 0xC2, 0xDF },
  { 0xE0, 0xE0 },
  { 0xE1, 0xEF },
  { 0xF0, 0xF0 },
  { 0xF1, 0xF3 },
  { 0xF4, 0xF4 },
  { 0xF5, 0xFF },
};
|
||||
|
||||
// Compiles "." and checks that the program's byte map has one class per
// entry of utf8ranges, with every byte of entry i mapped to class i.
TEST(TestCompile, ByteRanges) {
  Regexp* dot = Regexp::Parse(".", Regexp::PerlX, NULL);
  EXPECT_TRUE(dot != NULL);
  Prog* compiled = dot->CompileToProg(0);
  EXPECT_TRUE(compiled != NULL);
  EXPECT_EQ(compiled->bytemap_range(), arraysize(utf8ranges));
  for (int cls = 0; cls < arraysize(utf8ranges); cls++) {
    const UTF8ByteRange& range = utf8ranges[cls];
    for (int b = range.lo; b <= range.hi; b++) {
      EXPECT_EQ(compiled->bytemap()[b], cls) << " byte " << b;
    }
  }
  delete compiled;
  dot->Decref();
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,343 @@
|
|||
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "util/thread.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
|
||||
DECLARE_bool(re2_dfa_bail_when_slow);
|
||||
|
||||
DEFINE_int32(size, 8, "log2(number of DFA nodes)");
|
||||
DEFINE_int32(repeat, 2, "Repetition count.");
|
||||
DEFINE_int32(threads, 4, "number of threads");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Check that multithreaded access to DFA class works.
|
||||
|
||||
// Helper thread: builds entire DFA for prog.
|
||||
class BuildThread : public Thread {
|
||||
public:
|
||||
BuildThread(Prog* prog) : prog_(prog) {}
|
||||
virtual void Run() {
|
||||
CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
|
||||
}
|
||||
|
||||
private:
|
||||
Prog* prog_;
|
||||
};
|
||||
|
||||
// Builds the whole DFA for one program, first from a single helper
// thread and then from FLAGS_threads helper threads at once, repeating
// the concurrent part FLAGS_repeat times.
TEST(Multithreaded, BuildEntireDFA) {
  // Create regexp with 2^FLAGS_size states in DFA.
  string pattern = "a";
  for (int k = 0; k < FLAGS_size; k++)
    pattern += "[ab]";
  pattern += "b";

  // Check that single-threaded code works.
  {
    //LOG(INFO) << pattern;
    Regexp* re = Regexp::Parse(pattern.c_str(), Regexp::LikePerl, NULL);
    CHECK(re);
    Prog* prog = re->CompileToProg(0);
    CHECK(prog);
    BuildThread* solo = new BuildThread(prog);
    solo->SetJoinable(true);
    solo->Start();
    solo->Join();
    delete solo;
    delete prog;
    re->Decref();
  }

  // Build the DFA simultaneously in a bunch of threads.
  for (int round = 0; round < FLAGS_repeat; round++) {
    Regexp* re = Regexp::Parse(pattern.c_str(), Regexp::LikePerl, NULL);
    CHECK(re);
    Prog* prog = re->CompileToProg(0);
    CHECK(prog);

    // Create every thread before starting any of them.
    vector<BuildThread*> workers;
    for (int j = 0; j < FLAGS_threads; j++) {
      BuildThread* worker = new BuildThread(prog);
      worker->SetJoinable(true);
      workers.push_back(worker);
    }
    for (int j = 0; j < FLAGS_threads; j++) {
      workers[j]->Start();
    }
    for (int j = 0; j < FLAGS_threads; j++) {
      BuildThread* worker = workers[j];
      worker->Join();
      delete worker;
    }

    // One more compile, to make sure everything is okay.
    prog->BuildEntireDFA(Prog::kFirstMatch);
    delete prog;
    re->Decref();
  }
}
|
||||
|
||||
// Check that DFA size requirements are followed.
|
||||
// BuildEntireDFA will, like SearchDFA, stop building out
|
||||
// the DFA once the memory limits are reached.
|
||||
TEST(SingleThreaded, BuildEntireDFA) {
|
||||
// Create regexp with 2^30 states in DFA.
|
||||
string s = "a";
|
||||
for (int i = 0; i < 30; i++)
|
||||
s += "[ab]";
|
||||
s += "b";
|
||||
|
||||
//LOG(INFO) << s;
|
||||
Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
|
||||
CHECK(re);
|
||||
int max = 24;
|
||||
for (int i = 17; i < max; i++) {
|
||||
int limit = 1<<i;
|
||||
int usage, progusage, dfamem;
|
||||
{
|
||||
testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
|
||||
Prog* prog = re->CompileToProg(limit);
|
||||
CHECK(prog);
|
||||
progusage = m.HeapGrowth();
|
||||
dfamem = prog->dfa_mem();
|
||||
prog->BuildEntireDFA(Prog::kFirstMatch);
|
||||
prog->BuildEntireDFA(Prog::kLongestMatch);
|
||||
usage = m.HeapGrowth();
|
||||
delete prog;
|
||||
}
|
||||
if (!UsingMallocCounter)
|
||||
continue;
|
||||
//LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
|
||||
// limit, progusage, dfamem, usage);
|
||||
CHECK_GT(usage, limit*9/10);
|
||||
CHECK_LT(usage, limit + (16<<10)); // 16kB of slop okay
|
||||
}
|
||||
re->Decref();
|
||||
}
|
||||
|
||||
// Generates and returns a string over binary alphabet {0,1} that contains
|
||||
// all possible binary sequences of length n as subsequences. The obvious
|
||||
// brute force method would generate a string of length n * 2^n, but this
|
||||
// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
|
||||
// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
|
||||
// Such a string is useful for testing a DFA. If you have a DFA
|
||||
// where distinct last n bytes implies distinct states, then running on a
|
||||
// DeBruijn string causes the DFA to need to create a new state at every
|
||||
// position in the input, never reusing any states until it gets to the
|
||||
// end of the string. This is the worst possible case for DFA execution.
|
||||
static string DeBruijnString(int n) {
|
||||
CHECK_LT(n, 8*sizeof(int));
|
||||
CHECK_GT(n, 0);
|
||||
|
||||
vector<bool> did(1<<n);
|
||||
for (int i = 0; i < 1<<n; i++)
|
||||
did[i] = false;
|
||||
|
||||
string s;
|
||||
for (int i = 0; i < n-1; i++)
|
||||
s.append("0");
|
||||
int bits = 0;
|
||||
int mask = (1<<n) - 1;
|
||||
for (int i = 0; i < (1<<n); i++) {
|
||||
bits <<= 1;
|
||||
bits &= mask;
|
||||
if (!did[bits|1]) {
|
||||
bits |= 1;
|
||||
s.append("1");
|
||||
} else {
|
||||
s.append("0");
|
||||
}
|
||||
CHECK(!did[bits]);
|
||||
did[bits] = true;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
// Test that the DFA gets the right result even if it runs
|
||||
// out of memory during a search. The regular expression
|
||||
// 0[01]{n}$ matches a binary string of 0s and 1s only if
|
||||
// the (n+1)th-to-last character is a 0. Matching this in
|
||||
// a single forward pass (as done by the DFA) requires
|
||||
// keeping one bit for each of the last n+1 characters
|
||||
// (whether each was a 0), or 2^(n+1) possible states.
|
||||
// If we run this regexp to search in a string that contains
|
||||
// every possible n-character binary string as a substring,
|
||||
// then it will have to run through at least 2^n states.
|
||||
// States are big data structures -- certainly more than 1 byte --
|
||||
// so if the DFA can search correctly while staying within a
|
||||
// 2^n byte limit, it must be handling out-of-memory conditions
|
||||
// gracefully.
|
||||
TEST(SingleThreaded, SearchDFA) {
|
||||
// Choice of n is mostly arbitrary, except that:
|
||||
// * making n too big makes the test run for too long.
|
||||
// * making n too small makes the DFA refuse to run,
|
||||
// because it has so little memory compared to the program size.
|
||||
// Empirically, n = 18 is a good compromise between the two.
|
||||
const int n = 18;
|
||||
|
||||
Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
|
||||
Regexp::LikePerl, NULL);
|
||||
CHECK(re);
|
||||
|
||||
// The De Bruijn string for n ends with a 1 followed by n 0s in a row,
|
||||
// which is not a match for 0[01]{n}$. Adding one more 0 is a match.
|
||||
string no_match = DeBruijnString(n);
|
||||
string match = no_match + "0";
|
||||
|
||||
// The De Bruijn string is the worst case input for this regexp.
|
||||
// By default, the DFA will notice that it is flushing its cache
|
||||
// too frequently and will bail out early, so that RE2 can use the
|
||||
// NFA implementation instead. (The DFA loses its speed advantage
|
||||
// if it can't get a good cache hit rate.)
|
||||
// Tell the DFA to trudge along instead.
|
||||
FLAGS_re2_dfa_bail_when_slow = false;
|
||||
|
||||
int64 usage;
|
||||
int64 peak_usage;
|
||||
{
|
||||
testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
|
||||
Prog* prog = re->CompileToProg(1<<n);
|
||||
CHECK(prog);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
bool matched, failed = false;
|
||||
matched = prog->SearchDFA(match, NULL,
|
||||
Prog::kUnanchored, Prog::kFirstMatch,
|
||||
NULL, &failed, NULL);
|
||||
CHECK(!failed);
|
||||
CHECK(matched);
|
||||
matched = prog->SearchDFA(no_match, NULL,
|
||||
Prog::kUnanchored, Prog::kFirstMatch,
|
||||
NULL, &failed, NULL);
|
||||
CHECK(!failed);
|
||||
CHECK(!matched);
|
||||
}
|
||||
usage = m.HeapGrowth();
|
||||
peak_usage = m.PeakHeapGrowth();
|
||||
delete prog;
|
||||
}
|
||||
re->Decref();
|
||||
|
||||
if (!UsingMallocCounter)
|
||||
return;
|
||||
//LOG(INFO) << "usage " << usage << " " << peak_usage;
|
||||
CHECK_LT(usage, 1<<n);
|
||||
CHECK_LT(peak_usage, 1<<n);
|
||||
}
|
||||
|
||||
// Helper thread: searches for match, which should match,
|
||||
// and no_match, which should not.
|
||||
class SearchThread : public Thread {
|
||||
public:
|
||||
SearchThread(Prog* prog, const StringPiece& match,
|
||||
const StringPiece& no_match)
|
||||
: prog_(prog), match_(match), no_match_(no_match) {}
|
||||
|
||||
virtual void Run() {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
bool matched, failed = false;
|
||||
matched = prog_->SearchDFA(match_, NULL,
|
||||
Prog::kUnanchored, Prog::kFirstMatch,
|
||||
NULL, &failed, NULL);
|
||||
CHECK(!failed);
|
||||
CHECK(matched);
|
||||
matched = prog_->SearchDFA(no_match_, NULL,
|
||||
Prog::kUnanchored, Prog::kFirstMatch,
|
||||
NULL, &failed, NULL);
|
||||
CHECK(!failed);
|
||||
CHECK(!matched);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
Prog* prog_;
|
||||
StringPiece match_;
|
||||
StringPiece no_match_;
|
||||
};
|
||||
|
||||
// Runs the memory-limited DFA search from one helper thread and then
// from FLAGS_threads helper threads at once, FLAGS_repeat times.
TEST(Multithreaded, SearchDFA) {
  // Same as single-threaded test above.
  const int n = 18;
  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  CHECK(re);
  string no_match = DeBruijnString(n);
  string match = no_match + "0";
  FLAGS_re2_dfa_bail_when_slow = false;

  // Check that single-threaded code works.
  {
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);
    SearchThread* solo = new SearchThread(prog, match, no_match);
    solo->SetJoinable(true);
    solo->Start();
    solo->Join();
    delete solo;
    delete prog;
  }

  // Run the search simultaneously in a bunch of threads.
  // Reuse same flags for Multithreaded.BuildDFA above.
  for (int round = 0; round < FLAGS_repeat; round++) {
    //LOG(INFO) << "Search " << round;
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);

    // Create every thread before starting any of them.
    vector<SearchThread*> workers;
    for (int j = 0; j < FLAGS_threads; j++) {
      SearchThread* worker = new SearchThread(prog, match, no_match);
      worker->SetJoinable(true);
      workers.push_back(worker);
    }
    for (int j = 0; j < FLAGS_threads; j++) {
      workers[j]->Start();
    }
    for (int j = 0; j < FLAGS_threads; j++) {
      SearchThread* worker = workers[j];
      worker->Join();
      delete worker;
    }
    delete prog;
  }
  re->Decref();
}
|
||||
|
||||
// One case for the reverse-DFA test.
struct ReverseTest {
  const char *regexp;  // pattern to compile as a reverse program
  const char *text;    // text to search
  bool match;          // whether the search is expected to match
};

// Test that reverse DFA handles anchored/unanchored correctly.
// It's in the DFA interface but not used by RE2.
ReverseTest reverse_tests[] = {
  // Anchor satisfied: a match is expected.
  { "\\A(a|b)", "abc", true },
  { "(a|b)\\z", "cba", true },
  // Anchor not satisfied: no match is expected.
  { "\\A(a|b)", "cba", false },
  { "(a|b)\\z", "abc", false },
};
|
||||
|
||||
// Compiles each regexp in reverse_tests as a reverse program, searches
// its text with the DFA and compares the result with the expectation.
// A search that reports failure counts as a test failure as well.
TEST(DFA, ReverseMatch) {
  int nfail = 0;
  for (int i = 0; i < arraysize(reverse_tests); i++) {
    const ReverseTest& t = reverse_tests[i];
    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    CHECK(re);
    Prog *prog = re->CompileToReverseProg(0);
    CHECK(prog);
    bool failed = false;
    bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL);
    if (failed) {
      // The other DFA tests in this file CHECK(!failed); without this
      // branch a failed search would be read as "no match".
      LOG(ERROR) << t.regexp << " on " << t.text << ": DFA search failed";
      nfail++;
    } else if (matched != t.match) {
      LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match
                 << ", got " << matched;
      nfail++;
    }
    delete prog;
    re->Decref();
  }
  EXPECT_EQ(nfail, 0);
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,164 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Dump the regexp into a string showing structure.
|
||||
// Tested by parse_unittest.cc
|
||||
|
||||
// This function traverses the regexp recursively,
|
||||
// meaning that on inputs like Regexp::Simplify of
|
||||
// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
|
||||
// it takes time and space exponential in the size of the
|
||||
// original regular expression. It can also use stack space
|
||||
// linear in the size of the regular expression for inputs
|
||||
// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
|
||||
// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
|
||||
// As a result, Dump is provided only in the testing
|
||||
// library (see BUILD).
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
// Cause a link error if this file is used outside of testing.
|
||||
DECLARE_string(test_tmpdir);
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const char* kOpcodeNames[] = {
|
||||
"bad",
|
||||
"no",
|
||||
"emp",
|
||||
"lit",
|
||||
"str",
|
||||
"cat",
|
||||
"alt",
|
||||
"star",
|
||||
"plus",
|
||||
"que",
|
||||
"rep",
|
||||
"cap",
|
||||
"dot",
|
||||
"byte",
|
||||
"bol",
|
||||
"eol",
|
||||
"wb", // kRegexpWordBoundary
|
||||
"nwb", // kRegexpNoWordBoundary
|
||||
"bot",
|
||||
"eot",
|
||||
"cc",
|
||||
"match",
|
||||
};
|
||||
|
||||
// Create string representation of regexp with explicit structure.
|
||||
// Nothing pretty, just for testing.
|
||||
static void DumpRegexpAppending(Regexp* re, string* s) {
|
||||
if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
|
||||
StringAppendF(s, "op%d", re->op());
|
||||
} else {
|
||||
switch (re->op()) {
|
||||
default:
|
||||
break;
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
case kRegexpRepeat:
|
||||
if (re->parse_flags() & Regexp::NonGreedy)
|
||||
s->append("n");
|
||||
break;
|
||||
}
|
||||
s->append(kOpcodeNames[re->op()]);
|
||||
if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
|
||||
Rune r = re->rune();
|
||||
if ('a' <= r && r <= 'z')
|
||||
s->append("fold");
|
||||
}
|
||||
if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
|
||||
for (int i = 0; i < re->nrunes(); i++) {
|
||||
Rune r = re->runes()[i];
|
||||
if ('a' <= r && r <= 'z') {
|
||||
s->append("fold");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
s->append("{");
|
||||
switch (re->op()) {
|
||||
default:
|
||||
break;
|
||||
case kRegexpEndText:
|
||||
if (!(re->parse_flags() & Regexp::WasDollar)) {
|
||||
s->append("\\z");
|
||||
}
|
||||
break;
|
||||
case kRegexpLiteral: {
|
||||
Rune r = re->rune();
|
||||
char buf[UTFmax+1];
|
||||
buf[runetochar(buf, &r)] = 0;
|
||||
s->append(buf);
|
||||
break;
|
||||
}
|
||||
case kRegexpLiteralString:
|
||||
for (int i = 0; i < re->nrunes(); i++) {
|
||||
Rune r = re->runes()[i];
|
||||
char buf[UTFmax+1];
|
||||
buf[runetochar(buf, &r)] = 0;
|
||||
s->append(buf);
|
||||
}
|
||||
break;
|
||||
case kRegexpConcat:
|
||||
case kRegexpAlternate:
|
||||
for (int i = 0; i < re->nsub(); i++)
|
||||
DumpRegexpAppending(re->sub()[i], s);
|
||||
break;
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpCapture:
|
||||
if (re->name()) {
|
||||
s->append(*re->name());
|
||||
s->append(":");
|
||||
}
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpRepeat:
|
||||
s->append(StringPrintf("%d,%d ", re->min(), re->max()));
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpCharClass: {
|
||||
string sep;
|
||||
for (CharClass::iterator it = re->cc()->begin();
|
||||
it != re->cc()->end(); ++it) {
|
||||
RuneRange rr = *it;
|
||||
s->append(sep);
|
||||
if (rr.lo == rr.hi)
|
||||
s->append(StringPrintf("%#x", rr.lo));
|
||||
else
|
||||
s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
|
||||
sep = " ";
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
s->append("}");
|
||||
}
|
||||
|
||||
// Returns the structural dump of this regexp.
// When FLAGS_test_tmpdir is empty, logs an error and returns an empty
// string instead.
string Regexp::Dump() {
  string dumped;

  // Make sure being called from a unit test.
  if (!FLAGS_test_tmpdir.empty()) {
    DumpRegexpAppending(this, &dumped);
  } else {
    LOG(ERROR) << "Cannot use except for testing.";
  }
  return dumped;
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,42 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
DECLARE_string(regexp_engines);
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test simple repetition operators
|
||||
TEST(Repetition, Simple) {
|
||||
vector<string> ops = Split(" ",
|
||||
"%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
|
||||
"%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
|
||||
"%s* %s+ %s? %s*? %s+? %s??");
|
||||
ExhaustiveTest(3, 2, Explode("abc."), ops,
|
||||
6, Explode("ab"), "(?:%s)", "");
|
||||
ExhaustiveTest(3, 2, Explode("abc."), ops,
|
||||
40, Explode("a"), "(?:%s)", "");
|
||||
}
|
||||
|
||||
// Test capturing parens -- (a) -- inside repetition operators
|
||||
TEST(Repetition, Capturing) {
|
||||
vector<string> ops = Split(" ",
|
||||
"%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
|
||||
"%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
|
||||
"%s* %s+ %s? %s*? %s+? %s??");
|
||||
ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
|
||||
7, Explode("ab"), "(?:%s)", "");
|
||||
|
||||
// This would be a great test, but it runs forever when PCRE is enabled.
|
||||
if (strstr("PCRE", FLAGS_regexp_engines.c_str()) == NULL)
|
||||
ExhaustiveTest(4, 3, Split(" ", "a (a)"), ops,
|
||||
100, Explode("a"), "(?:%s)", "");
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
DECLARE_string(regexp_engines);
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test empty string matches (aka "(?:)")
|
||||
TEST(EmptyString, Exhaustive) {
|
||||
ExhaustiveTest(2, 2, Split(" ", "(?:) a"),
|
||||
RegexpGenerator::EgrepOps(),
|
||||
5, Split("", "ab"), "", "");
|
||||
}
|
||||
|
||||
// Test escaped versions of regexp syntax.
|
||||
TEST(Punctuation, Literals) {
  // The raw punctuation characters form the string alphabet ...
  const vector<string> literals = Explode("()*+?{}[]\\^$.");

  // ... and their backslash-escaped forms are the regexp atoms.
  vector<string> atoms;
  for (size_t k = 0; k < literals.size(); k++)
    atoms.push_back("\\" + literals[k]);

  ExhaustiveTest(1, 1, atoms, RegexpGenerator::EgrepOps(),
                 2, literals, "", "");
}
|
||||
|
||||
// Test ^ $ . \A \z in presence of line endings.
|
||||
// Have to wrap the empty-width ones in (?:) so that
|
||||
// they can be repeated -- PCRE rejects ^* but allows (?:^)*
|
||||
TEST(LineEnds, Exhaustive) {
|
||||
ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"),
|
||||
RegexpGenerator::EgrepOps(),
|
||||
4, Explode("ab\n"), "", "");
|
||||
}
|
||||
|
||||
// Test what does and does not match \n.
|
||||
// This would be a good test, except that PCRE seems to have a bug:
|
||||
// in single-byte character set mode (the default),
|
||||
// [^a] matches \n, but in UTF-8 mode it does not.
|
||||
// So when we run the test, the tester complains that
|
||||
// we don't agree with PCRE, but it's PCRE that is at fault.
|
||||
// For what it's worth, Perl gets this right (matches
|
||||
// regardless of whether UTF-8 input is selected):
|
||||
//
|
||||
// #!/usr/bin/perl
|
||||
// use POSIX qw(locale_h);
|
||||
// print "matches in latin1\n" if "\n" =~ /[^a]/;
|
||||
// setlocale("en_US.utf8");
|
||||
// print "matches in utf8\n" if "\n" =~ /[^a]/;
|
||||
//
|
||||
// The rule chosen for RE2 is that by default, like Perl,
|
||||
// dot does not match \n but negated character classes [^a] do.
|
||||
// (?s) will allow dot to match \n; there is no way in RE2
|
||||
// to stop [^a] from matching \n, though the underlying library
|
||||
// provides a mechanism, and RE2 could add new syntax if needed.
|
||||
//
|
||||
// TEST(Newlines, Exhaustive) {
|
||||
// vector<string> empty_vector;
|
||||
// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
|
||||
// RegexpGenerator::EgrepOps(),
|
||||
// 4, Explode("a\n"), "");
|
||||
// }
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test simple character classes by themselves.
|
||||
TEST(CharacterClasses, Exhaustive) {
|
||||
vector<string> atoms = Split(" ",
|
||||
"[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
|
||||
ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
|
||||
5, Explode("ab"), "", "");
|
||||
}
|
||||
|
||||
// Test simple character classes inside a___b (for example, a[a]b).
|
||||
TEST(CharacterClasses, ExhaustiveAB) {
|
||||
vector<string> atoms = Split(" ",
|
||||
"[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .");
|
||||
ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
|
||||
5, Explode("ab"), "a%sb", "");
|
||||
}
|
||||
|
||||
// Returns UTF8 for Rune r
|
||||
static string UTF8(Rune r) {
  // Room for the longest encoding plus the terminating NUL.
  char encoded[UTFmax+1];
  const int len = runetochar(encoded, &r);
  encoded[len] = 0;
  return string(encoded);
}
|
||||
|
||||
// Returns a vector of "interesting" UTF8 characters.
|
||||
// Unicode is now too big to just return all of them,
|
||||
// so InterestingUTF8 returns a set likely to be good test cases.
|
||||
static const vector<string>& InterestingUTF8() {
  static bool built;
  static vector<string> chars;

  // The list is built on the first call and reused afterwards.
  if (!built) {
    built = true;

    // All the Latin1 equivalents are interesting.
    for (int r = 1; r < 256; r++)
      chars.push_back(UTF8(r));

    // After that, the codes near bit boundaries are
    // interesting, because they span byte sequence lengths.
    for (int r = 256; r < 256 + 8; r++)
      chars.push_back(UTF8(r));
    for (int boundary = 512; boundary < Runemax; boundary <<= 1)
      for (int r = boundary - 8; r < boundary + 8; r++)
        chars.push_back(UTF8(r));

    // The codes near Runemax, including Runemax itself, are interesting.
    for (int r = Runemax - 8; r <= Runemax; r++)
      chars.push_back(UTF8(r));
  }

  return chars;
}
|
||||
|
||||
// Test interesting UTF-8 characters against character classes.
|
||||
TEST(InterestingUTF8, SingleOps) {
|
||||
vector<string> atoms = Split(" ",
|
||||
". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
|
||||
"[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
|
||||
"[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
|
||||
"[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
|
||||
vector<string> ops; // no ops
|
||||
ExhaustiveTest(1, 0, atoms, ops,
|
||||
1, InterestingUTF8(), "", "");
|
||||
}
|
||||
|
||||
// Test interesting UTF-8 characters against character classes,
|
||||
// but wrap everything inside AB.
|
||||
TEST(InterestingUTF8, AB) {
  const vector<string> atoms = Split(" ",
    ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
    "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
    "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
    "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]");
  const vector<string> no_ops;  // atoms are tested bare, without operators

  // Surround every interesting character with a...b, matching the
  // "a%sb" wrapper applied to the regexps.
  const vector<string>& chars = InterestingUTF8();
  vector<string> wrapped;
  for (vector<string>::const_iterator it = chars.begin();
       it != chars.end(); ++it)
    wrapped.push_back("a" + *it + "b");

  ExhaustiveTest(1, 0, atoms, no_ops,
                 1, wrapped, "a%sb", "");
}
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
DECLARE_string(regexp_engines);
|
||||
|
||||
// Test very simple expressions.
|
||||
TEST(EgrepLiterals, Lowercase) {
|
||||
EgrepTest(3, 2, "abc.", 3, "abc", "");
|
||||
}
|
||||
|
||||
// Test mixed-case expressions.
|
||||
TEST(EgrepLiterals, MixedCase) {
|
||||
EgrepTest(3, 2, "AaBb.", 2, "AaBb", "");
|
||||
}
|
||||
|
||||
// Test mixed-case in case-insensitive mode.
|
||||
TEST(EgrepLiterals, FoldCase) {
|
||||
// The punctuation characters surround A-Z and a-z
|
||||
// in the ASCII table. This looks for bugs in the
|
||||
// bytemap range code in the DFA.
|
||||
EgrepTest(3, 2, "abAB.", 2, "aBc@_~", "(?i:%s)");
|
||||
}
|
||||
|
||||
// Test very simple expressions against strings containing a multibyte UTF-8 character.
|
||||
TEST(EgrepLiterals, UTF8) {
|
||||
EgrepTest(3, 2, "ab.", 4, "a\xE2\x98\xBA", "");
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
// Each test picks an alphabet (e.g., "abc"), a maximum string length,
|
||||
// a maximum regular expression length, and a maximum number of letters
|
||||
// that can appear in the regular expression. Given these parameters,
|
||||
// it tries every possible regular expression and string, verifying that
|
||||
// the NFA, DFA, and a trivial backtracking implementation agree about
|
||||
// the location of the match.
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifndef LOGGING
|
||||
#define LOGGING 0
|
||||
#endif
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
#include "re2/testing/tester.h"
|
||||
|
||||
DEFINE_bool(show_regexps, false, "show regexps during testing");
|
||||
|
||||
DEFINE_int32(max_bad_regexp_inputs, 1,
|
||||
"Stop testing a regular expression after finding this many "
|
||||
"strings that break it.");
|
||||
|
||||
// Compiled in debug mode, the usual tests run for over an hour.
|
||||
// Have to cut it down to make the unit test machines happy.
|
||||
DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode.");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static char* escape(const StringPiece& sp) {
|
||||
static char buf[512];
|
||||
char* p = buf;
|
||||
*p++ = '\"';
|
||||
for (int i = 0; i < sp.size(); i++) {
|
||||
if(p+5 >= buf+sizeof buf)
|
||||
LOG(FATAL) << "ExhaustiveTester escape: too long";
|
||||
if(sp[i] == '\\' || sp[i] == '\"') {
|
||||
*p++ = '\\';
|
||||
*p++ = sp[i];
|
||||
} else if(sp[i] == '\n') {
|
||||
*p++ = '\\';
|
||||
*p++ = 'n';
|
||||
} else {
|
||||
*p++ = sp[i];
|
||||
}
|
||||
}
|
||||
*p++ = '\"';
|
||||
*p = '\0';
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) {
|
||||
if (!re.Match(input, 0, input.size(), anchor, m, n)) {
|
||||
printf("-");
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (i > 0)
|
||||
printf(" ");
|
||||
if (m[i].begin() == NULL)
|
||||
printf("-");
|
||||
else
|
||||
printf("%d-%d", static_cast<int>(m[i].begin() - input.begin()), static_cast<int>(m[i].end() - input.begin()));
|
||||
}
|
||||
}
|
||||
|
||||
// Processes a single generated regexp.
|
||||
// Compiles it using Regexp interface and PCRE, and then
|
||||
// checks that NFA, DFA, and PCRE all return the same results.
|
||||
void ExhaustiveTester::HandleRegexp(const string& const_regexp) {
|
||||
regexps_++;
|
||||
string regexp = const_regexp;
|
||||
if (!topwrapper_.empty())
|
||||
regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
|
||||
|
||||
if (FLAGS_show_regexps) {
|
||||
printf("\r%s", regexp.c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
if (LOGGING) {
|
||||
// Write out test cases and answers for use in testing
|
||||
// other implementations, such as Go's regexp package.
|
||||
if (randomstrings_)
|
||||
LOG(ERROR) << "Cannot log with random strings.";
|
||||
if (regexps_ == 1) { // first
|
||||
printf("strings\n");
|
||||
strgen_.Reset();
|
||||
while (strgen_.HasNext())
|
||||
printf("%s\n", escape(strgen_.Next()));
|
||||
printf("regexps\n");
|
||||
}
|
||||
printf("%s\n", escape(regexp));
|
||||
|
||||
RE2 re(regexp);
|
||||
RE2::Options longest;
|
||||
longest.set_longest_match(true);
|
||||
RE2 relongest(regexp, longest);
|
||||
int ngroup = re.NumberOfCapturingGroups()+1;
|
||||
StringPiece* group = new StringPiece[ngroup];
|
||||
|
||||
strgen_.Reset();
|
||||
while (strgen_.HasNext()) {
|
||||
StringPiece input = strgen_.Next();
|
||||
PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
|
||||
printf(";");
|
||||
PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
|
||||
printf(";");
|
||||
PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
|
||||
printf(";");
|
||||
PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
|
||||
printf("\n");
|
||||
}
|
||||
delete[] group;
|
||||
return;
|
||||
}
|
||||
|
||||
Tester tester(regexp);
|
||||
if (tester.error())
|
||||
return;
|
||||
|
||||
strgen_.Reset();
|
||||
strgen_.GenerateNULL();
|
||||
if (randomstrings_)
|
||||
strgen_.Random(stringseed_, stringcount_);
|
||||
int bad_inputs = 0;
|
||||
while (strgen_.HasNext()) {
|
||||
tests_++;
|
||||
if (!tester.TestInput(strgen_.Next())) {
|
||||
failures_++;
|
||||
if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Runs an exhaustive test on the given parameters.
|
||||
void ExhaustiveTest(int maxatoms, int maxops,
|
||||
const vector<string>& alphabet,
|
||||
const vector<string>& ops,
|
||||
int maxstrlen, const vector<string>& stralphabet,
|
||||
const string& wrapper,
|
||||
const string& topwrapper) {
|
||||
if (DEBUG_MODE && FLAGS_quick_debug_mode) {
|
||||
if (maxatoms > 1)
|
||||
maxatoms--;
|
||||
if (maxops > 1)
|
||||
maxops--;
|
||||
if (maxstrlen > 1)
|
||||
maxstrlen--;
|
||||
}
|
||||
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
|
||||
maxstrlen, stralphabet, wrapper,
|
||||
topwrapper);
|
||||
t.Generate();
|
||||
if (!LOGGING) {
|
||||
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
|
||||
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
|
||||
}
|
||||
EXPECT_EQ(0, t.failures());
|
||||
}
|
||||
|
||||
// Runs an exhaustive test using the given parameters and
|
||||
// the basic egrep operators.
|
||||
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
|
||||
int maxstrlen, const string& stralphabet,
|
||||
const string& wrapper) {
|
||||
const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
|
||||
|
||||
for (int i = 0; i < arraysize(tops); i++) {
|
||||
ExhaustiveTest(maxatoms, maxops,
|
||||
Split("", alphabet),
|
||||
RegexpGenerator::EgrepOps(),
|
||||
maxstrlen,
|
||||
Split("", stralphabet),
|
||||
wrapper,
|
||||
tops[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,85 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__
|
||||
#define RE2_TESTING_EXHAUSTIVE_TESTER_H__
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/util.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Exhaustive regular expression test: generate all regexps within parameters,
|
||||
// then generate all strings of a given length over a given alphabet,
|
||||
// then check that NFA, DFA, and PCRE agree about whether each regexp matches
|
||||
// each possible string, and if so, where the match is.
|
||||
//
|
||||
// Can also be used in a "random" mode that generates a given number
|
||||
// of random regexp and strings, allowing testing of larger expressions
|
||||
// and inputs.
|
||||
class ExhaustiveTester : public RegexpGenerator {
|
||||
public:
|
||||
ExhaustiveTester(int maxatoms,
|
||||
int maxops,
|
||||
const vector<string>& alphabet,
|
||||
const vector<string>& ops,
|
||||
int maxstrlen,
|
||||
const vector<string>& stralphabet,
|
||||
const string& wrapper,
|
||||
const string& topwrapper)
|
||||
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
|
||||
strgen_(maxstrlen, stralphabet),
|
||||
wrapper_(wrapper),
|
||||
topwrapper_(topwrapper),
|
||||
regexps_(0), tests_(0), failures_(0),
|
||||
randomstrings_(0), stringseed_(0), stringcount_(0) { }
|
||||
|
||||
int regexps() { return regexps_; }
|
||||
int tests() { return tests_; }
|
||||
int failures() { return failures_; }
|
||||
|
||||
// Needed for RegexpGenerator interface.
|
||||
void HandleRegexp(const string& regexp);
|
||||
|
||||
// Causes testing to generate random input strings.
|
||||
void RandomStrings(int32 seed, int32 count) {
|
||||
randomstrings_ = true;
|
||||
stringseed_ = seed;
|
||||
stringcount_ = count;
|
||||
}
|
||||
|
||||
private:
|
||||
StringGenerator strgen_;
|
||||
string wrapper_; // Regexp wrapper - either empty or has one %s.
|
||||
string topwrapper_; // Regexp top-level wrapper.
|
||||
int regexps_; // Number of HandleRegexp calls
|
||||
int tests_; // Number of regexp tests.
|
||||
int failures_; // Number of tests failed.
|
||||
|
||||
bool randomstrings_; // Whether to use random strings
|
||||
int32 stringseed_; // If so, the seed.
|
||||
int stringcount_; // If so, how many to generate.
|
||||
DISALLOW_EVIL_CONSTRUCTORS(ExhaustiveTester);
|
||||
};
|
||||
|
||||
// Runs an exhaustive test on the given parameters.
|
||||
void ExhaustiveTest(int maxatoms, int maxops,
|
||||
const vector<string>& alphabet,
|
||||
const vector<string>& ops,
|
||||
int maxstrlen, const vector<string>& stralphabet,
|
||||
const string& wrapper,
|
||||
const string& topwrapper);
|
||||
|
||||
// Runs an exhaustive test using the given parameters and
|
||||
// the basic egrep operators.
|
||||
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
|
||||
int maxstrlen, const string& stralphabet,
|
||||
const string& wrapper);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__
|
|
@ -0,0 +1,258 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/filtered_re2.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct FilterTestVars {
|
||||
vector<string> atoms;
|
||||
vector<int> atom_indices;
|
||||
vector<int> matches;
|
||||
RE2::Options opts;
|
||||
FilteredRE2 f;
|
||||
};
|
||||
|
||||
TEST(FilteredRE2Test, EmptyTest) {
|
||||
FilterTestVars v;
|
||||
v.f.AllMatches("foo", v.atom_indices, &v.matches);
|
||||
EXPECT_EQ(0, v.matches.size());
|
||||
}
|
||||
|
||||
TEST(FilteredRE2Test, SmallOrTest) {
|
||||
FLAGS_filtered_re2_min_atom_len = 4;
|
||||
|
||||
FilterTestVars v;
|
||||
int id;
|
||||
v.f.Add("(foo|bar)", v.opts, &id);
|
||||
|
||||
v.f.Compile(&v.atoms);
|
||||
EXPECT_EQ(0, v.atoms.size());
|
||||
|
||||
v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
|
||||
EXPECT_EQ(1, v.matches.size());
|
||||
EXPECT_EQ(id, v.matches[0]);
|
||||
}
|
||||
|
||||
struct AtomTest {
|
||||
const char* testname;
|
||||
// If any test needs more than this many regexps or atoms, increase
|
||||
// the size of the corresponding array.
|
||||
const char* regexps[20];
|
||||
const char* atoms[20];
|
||||
};
|
||||
|
||||
AtomTest atom_tests[] = {
|
||||
{
|
||||
// This test checks to make sure empty patterns are allowed.
|
||||
"CheckEmptyPattern",
|
||||
{""},
|
||||
{}
|
||||
}, {
|
||||
// This test checks that all atoms of length greater than min length
|
||||
// are found, and no atoms that are of smaller length are found.
|
||||
"AllAtomsGtMinLengthFound", {
|
||||
"(abc123|def456|ghi789).*mnop[x-z]+",
|
||||
"abc..yyy..zz",
|
||||
"mnmnpp[a-z]+PPP"
|
||||
}, {
|
||||
"abc123",
|
||||
"def456",
|
||||
"ghi789",
|
||||
"mnop",
|
||||
"abc",
|
||||
"yyy",
|
||||
"mnmnpp",
|
||||
"ppp"
|
||||
}
|
||||
}, {
|
||||
// Test to make sure that any atoms that have another atom as a
|
||||
// substring in an OR are removed; that is, only the shortest
|
||||
// substring is kept.
|
||||
"SubstrAtomRemovesSuperStrInOr", {
|
||||
"(abc123|abc|ghi789|abc1234).*[x-z]+",
|
||||
"abcd..yyy..yyyzzz",
|
||||
"mnmnpp[a-z]+PPP"
|
||||
}, {
|
||||
"abc",
|
||||
"ghi789",
|
||||
"abcd",
|
||||
"yyy",
|
||||
"yyyzzz",
|
||||
"mnmnpp",
|
||||
"ppp"
|
||||
}
|
||||
}, {
|
||||
// Test character class expansion.
|
||||
"CharClassExpansion", {
|
||||
"m[a-c][d-f]n.*[x-z]+",
|
||||
"[x-y]bcde[ab]"
|
||||
}, {
|
||||
"madn", "maen", "mafn",
|
||||
"mbdn", "mben", "mbfn",
|
||||
"mcdn", "mcen", "mcfn",
|
||||
"xbcdea", "xbcdeb",
|
||||
"ybcdea", "ybcdeb"
|
||||
}
|
||||
}, {
|
||||
// Test upper/lower of non-ASCII.
|
||||
"UnicodeLower", {
|
||||
"(?i)ΔδΠϖπΣςσ",
|
||||
"ΛΜΝΟΠ",
|
||||
"ψρστυ",
|
||||
}, {
|
||||
"δδπππσσσ",
|
||||
"λμνοπ",
|
||||
"ψρστυ",
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
void AddRegexpsAndCompile(const char* regexps[],
|
||||
int n,
|
||||
struct FilterTestVars* v) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
int id;
|
||||
v->f.Add(regexps[i], v->opts, &id);
|
||||
}
|
||||
v->f.Compile(&v->atoms);
|
||||
}
|
||||
|
||||
bool CheckExpectedAtoms(const char* atoms[],
|
||||
int n,
|
||||
const char* testname,
|
||||
struct FilterTestVars* v) {
|
||||
vector<string> expected;
|
||||
for (int i = 0; i < n; i++)
|
||||
expected.push_back(atoms[i]);
|
||||
|
||||
bool pass = expected.size() == v->atoms.size();
|
||||
|
||||
sort(v->atoms.begin(), v->atoms.end());
|
||||
sort(expected.begin(), expected.end());
|
||||
for (int i = 0; pass && i < n; i++)
|
||||
pass = pass && expected[i] == v->atoms[i];
|
||||
|
||||
if (!pass) {
|
||||
LOG(WARNING) << "Failed " << testname;
|
||||
LOG(WARNING) << "Expected #atoms = " << expected.size();
|
||||
for (int i = 0; i < expected.size(); i++)
|
||||
LOG(WARNING) << expected[i];
|
||||
LOG(WARNING) << "Found #atoms = " << v->atoms.size();
|
||||
for (int i = 0; i < v->atoms.size(); i++)
|
||||
LOG(WARNING) << v->atoms[i];
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
TEST(FilteredRE2Test, AtomTests) {
  FLAGS_filtered_re2_min_atom_len = 3;

  int nfail = 0;
  for (int i = 0; i < arraysize(atom_tests); i++) {
    FilterTestVars v;
    AtomTest& test = atom_tests[i];

    // The fixed-size arrays are NULL-padded; count the entries in use.
    int nregexp = 0;
    while (nregexp < arraysize(test.regexps) && test.regexps[nregexp] != NULL)
      nregexp++;
    int natom = 0;
    while (natom < arraysize(test.atoms) && test.atoms[natom] != NULL)
      natom++;

    AddRegexpsAndCompile(test.regexps, nregexp, &v);
    const bool ok = CheckExpectedAtoms(test.atoms, natom, test.testname, &v);
    if (!ok)
      nfail++;
  }
  EXPECT_EQ(0, nfail);
}
|
||||
|
||||
void FindAtomIndices(const vector<string> atoms,
|
||||
const vector<string> matched_atoms,
|
||||
vector<int>* atom_indices) {
|
||||
atom_indices->clear();
|
||||
for (int i = 0; i < matched_atoms.size(); i++) {
|
||||
int j = 0;
|
||||
for (; j < atoms.size(); j++) {
|
||||
if (matched_atoms[i] == atoms[j]) {
|
||||
atom_indices->push_back(j);
|
||||
break;
|
||||
}
|
||||
EXPECT_LT(j, atoms.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(FilteredRE2Test, MatchEmptyPattern) {
  FLAGS_filtered_re2_min_atom_len = 3;

  FilterTestVars v;
  AtomTest* t = &atom_tests[0];
  // We are using the regexps used in one of the atom tests
  // for this test. Adding the EXPECT here to make sure
  // the index we use for the test is for the correct test.
  EXPECT_EQ("CheckEmptyPattern", string(t->testname));
  // The regexps array is NULL-padded; count the entries in use.
  int nregexp;
  for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
    if (t->regexps[nregexp] == NULL)
      break;
  AddRegexpsAndCompile(t->regexps, nregexp, &v);
  string text = "0123";
  // No atom ids are supplied; the empty pattern (regexp 0) is still
  // expected to be reported as the first match.
  vector<int> atom_ids;
  EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
}
|
||||
|
||||
TEST(FilteredRE2Test, MatchTests) {
|
||||
FLAGS_filtered_re2_min_atom_len = 3;
|
||||
|
||||
FilterTestVars v;
|
||||
AtomTest* t = &atom_tests[2];
|
||||
// We are using the regexps used in one of the atom tests
|
||||
// for this test.
|
||||
EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname));
|
||||
int nregexp;
|
||||
for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
|
||||
if (t->regexps[nregexp] == NULL)
|
||||
break;
|
||||
AddRegexpsAndCompile(t->regexps, nregexp, &v);
|
||||
|
||||
string text = "abc121212xyz";
|
||||
// atoms = abc
|
||||
vector<int> atom_ids;
|
||||
vector<string> atoms;
|
||||
atoms.push_back("abc");
|
||||
FindAtomIndices(v.atoms, atoms, &atom_ids);
|
||||
vector<int> matching_regexps;
|
||||
v.f.AllMatches(text, atom_ids, &matching_regexps);
|
||||
EXPECT_EQ(1, matching_regexps.size());
|
||||
|
||||
text = "abc12312yyyzzz";
|
||||
atoms.clear();
|
||||
atoms.push_back("abc");
|
||||
atoms.push_back("yyy");
|
||||
atoms.push_back("yyyzzz");
|
||||
FindAtomIndices(v.atoms, atoms, &atom_ids);
|
||||
v.f.AllMatches(text, atom_ids, &matching_regexps);
|
||||
EXPECT_EQ(1, matching_regexps.size());
|
||||
|
||||
text = "abcd12yyy32yyyzzz";
|
||||
atoms.clear();
|
||||
atoms.push_back("abc");
|
||||
atoms.push_back("abcd");
|
||||
atoms.push_back("yyy");
|
||||
atoms.push_back("yyyzzz");
|
||||
FindAtomIndices(v.atoms, atoms, &atom_ids);
|
||||
LOG(INFO) << "S: " << atom_ids.size();
|
||||
for (int i = 0; i < atom_ids.size(); i++)
|
||||
LOG(INFO) << "i: " << i << " : " << atom_ids[i];
|
||||
v.f.AllMatches(text, atom_ids, &matching_regexps);
|
||||
EXPECT_EQ(2, matching_regexps.size());
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,76 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct PCRETest {
|
||||
const char* regexp;
|
||||
bool should_match;
|
||||
};
|
||||
|
||||
static PCRETest tests[] = {
|
||||
// Most things should behave exactly.
|
||||
{ "abc", true },
|
||||
{ "(a|b)c", true },
|
||||
{ "(a*|b)c", true },
|
||||
{ "(a|b*)c", true },
|
||||
{ "a(b|c)d", true },
|
||||
{ "a(()|())c", true },
|
||||
{ "ab*c", true },
|
||||
{ "ab+c", true },
|
||||
{ "a(b*|c*)d", true },
|
||||
{ "\\W", true },
|
||||
{ "\\W{1,2}", true },
|
||||
{ "\\d", true },
|
||||
|
||||
// Check that repeated empty strings do not.
|
||||
{ "(a*)*", false },
|
||||
{ "x(a*)*y", false },
|
||||
{ "(a*)+", false },
|
||||
{ "(a+)*", true },
|
||||
{ "(a+)+", true },
|
||||
{ "(a+)+", true },
|
||||
|
||||
// \v is the only character class that shouldn't.
|
||||
{ "\\b", true },
|
||||
{ "\\v", false },
|
||||
{ "\\d", true },
|
||||
|
||||
// The handling of ^ in multi-line mode is different, as is
|
||||
// the handling of $ in single-line mode. (Both involve
|
||||
// boundary cases if the string ends with \n.)
|
||||
{ "\\A", true },
|
||||
{ "\\z", true },
|
||||
{ "(?m)^", false },
|
||||
{ "(?m)$", true },
|
||||
{ "(?-m)^", true },
|
||||
{ "(?-m)$", false }, // In PCRE, == \Z
|
||||
{ "(?m)\\A", true },
|
||||
{ "(?m)\\z", true },
|
||||
{ "(?-m)\\A", true },
|
||||
{ "(?-m)\\z", true },
|
||||
};
|
||||
|
||||
TEST(MimicsPCRE, SimpleTests) {
  for (int i = 0; i < arraysize(tests); i++) {
    const PCRETest& t = tests[i];
    // Each regexp is checked twice: first parsed as Latin-1, then without
    // the Latin1 flag.
    for (int pass = 0; pass < 2; pass++) {
      const bool latin1 = (pass == 0);
      const Regexp::ParseFlags flags =
          latin1 ? (Regexp::LikePerl | Regexp::Latin1) : Regexp::LikePerl;
      const char* const encoding = latin1 ? "latin1" : "utf";

      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
      CHECK(re) << " " << t.regexp;
      CHECK_EQ(t.should_match, re->MimicsPCRE())
        << " " << t.regexp << " "
        << encoding;
      re->Decref();
    }
  }
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,44 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Null walker. For benchmarking the walker itself.
|
||||
|
||||
class NullWalker : public Regexp::Walker<bool> {
|
||||
public:
|
||||
NullWalker() { }
|
||||
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args);
|
||||
|
||||
bool ShortVisit(Regexp* re, bool a) {
|
||||
// Should never be called: we use Walk not WalkExponential.
|
||||
LOG(DFATAL) << "NullWalker::ShortVisit called";
|
||||
return a;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_EVIL_CONSTRUCTORS(NullWalker);
|
||||
};
|
||||
|
||||
// Called after visiting re's children. child_args contains the return
|
||||
// value from each of the children's PostVisits. Because this walker
|
||||
// exists only to exercise the traversal, it ignores every argument
|
||||
// and always returns false.
|
||||
bool NullWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Walks this regexp with a NullWalker and discards the result.
|
||||
void Regexp::NullWalk() {
|
||||
NullWalker w;
|
||||
w.Walk(this, false);
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,376 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test parse.cc, dump.cc, and tostring.cc.
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct Test {
|
||||
const char* regexp;
|
||||
const char* parse;
|
||||
};
|
||||
|
||||
static Test tests[] = {
|
||||
// Base cases
|
||||
{ "a", "lit{a}" },
|
||||
{ "a.", "cat{lit{a}dot{}}" },
|
||||
{ "a.b", "cat{lit{a}dot{}lit{b}}" },
|
||||
{ "ab", "str{ab}" },
|
||||
{ "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
|
||||
{ "abc", "str{abc}" },
|
||||
{ "a|^", "alt{lit{a}bol{}}" },
|
||||
{ "a|b", "cc{0x61-0x62}" },
|
||||
{ "(a)", "cap{lit{a}}" },
|
||||
{ "(a)|b", "alt{cap{lit{a}}lit{b}}" },
|
||||
{ "a*", "star{lit{a}}" },
|
||||
{ "a+", "plus{lit{a}}" },
|
||||
{ "a?", "que{lit{a}}" },
|
||||
{ "a{2}", "rep{2,2 lit{a}}" },
|
||||
{ "a{2,3}", "rep{2,3 lit{a}}" },
|
||||
{ "a{2,}", "rep{2,-1 lit{a}}" },
|
||||
{ "a*?", "nstar{lit{a}}" },
|
||||
{ "a+?", "nplus{lit{a}}" },
|
||||
{ "a??", "nque{lit{a}}" },
|
||||
{ "a{2}?", "nrep{2,2 lit{a}}" },
|
||||
{ "a{2,3}?", "nrep{2,3 lit{a}}" },
|
||||
{ "a{2,}?", "nrep{2,-1 lit{a}}" },
|
||||
{ "", "emp{}" },
|
||||
{ "|", "emp{}" }, // alt{emp{}emp{}} but got factored
|
||||
{ "|x|", "alt{emp{}lit{x}emp{}}" },
|
||||
{ ".", "dot{}" },
|
||||
{ "^", "bol{}" },
|
||||
{ "$", "eol{}" },
|
||||
{ "\\|", "lit{|}" },
|
||||
{ "\\(", "lit{(}" },
|
||||
{ "\\)", "lit{)}" },
|
||||
{ "\\*", "lit{*}" },
|
||||
{ "\\+", "lit{+}" },
|
||||
{ "\\?", "lit{?}" },
|
||||
{ "{", "lit{{}" },
|
||||
{ "}", "lit{}}" },
|
||||
{ "\\.", "lit{.}" },
|
||||
{ "\\^", "lit{^}" },
|
||||
{ "\\$", "lit{$}" },
|
||||
{ "\\\\", "lit{\\}" },
|
||||
{ "[ace]", "cc{0x61 0x63 0x65}" },
|
||||
{ "[abc]", "cc{0x61-0x63}" },
|
||||
{ "[a-z]", "cc{0x61-0x7a}" },
|
||||
{ "[a]", "lit{a}" },
|
||||
{ "\\-", "lit{-}" },
|
||||
{ "-", "lit{-}" },
|
||||
{ "\\_", "lit{_}" },
|
||||
|
||||
// Posix and Perl extensions
|
||||
{ "[[:lower:]]", "cc{0x61-0x7a}" },
|
||||
{ "[a-z]", "cc{0x61-0x7a}" },
|
||||
{ "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
|
||||
{ "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
|
||||
{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
|
||||
{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
|
||||
{ "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
||||
{ "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
||||
{ "\\d", "cc{0x30-0x39}" },
|
||||
{ "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
|
||||
{ "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
|
||||
{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
|
||||
{ "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
|
||||
{ "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
|
||||
{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
|
||||
{ "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
|
||||
{ "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
|
||||
{ "\\C", "byte{}" },
|
||||
|
||||
// Unicode, negatives, and a double negative.
|
||||
{ "\\p{Braille}", "cc{0x2800-0x28ff}" },
|
||||
{ "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
|
||||
{ "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
|
||||
{ "\\P{^Braille}", "cc{0x2800-0x28ff}" },
|
||||
|
||||
// More interesting regular expressions.
|
||||
{ "a{,2}", "str{a{,2}}" },
|
||||
{ "\\.\\^\\$\\\\", "str{.^$\\}" },
|
||||
{ "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
|
||||
{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
|
||||
{ "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
|
||||
{ "a*{", "cat{star{lit{a}}lit{{}}" },
|
||||
|
||||
// Test precedences
|
||||
{ "(?:ab)*", "star{str{ab}}" },
|
||||
{ "(ab)*", "star{cap{str{ab}}}" },
|
||||
{ "ab|cd", "alt{str{ab}str{cd}}" },
|
||||
{ "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
|
||||
|
||||
// Test flattening.
|
||||
{ "(?:a)", "lit{a}" },
|
||||
{ "(?:ab)(?:cd)", "str{abcd}" },
|
||||
{ "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
|
||||
{ "a|.", "dot{}" },
|
||||
{ ".|a", "dot{}" },
|
||||
|
||||
// Test Perl quoted literals
|
||||
{ "\\Q+|*?{[\\E", "str{+|*?{[}" },
|
||||
{ "\\Q+\\E+", "plus{lit{+}}" },
|
||||
{ "\\Q\\\\E", "lit{\\}" },
|
||||
{ "\\Q\\\\\\E", "str{\\\\}" },
|
||||
|
||||
// Test Perl \A and \z
|
||||
{ "(?m)^", "bol{}" },
|
||||
{ "(?m)$", "eol{}" },
|
||||
{ "(?-m)^", "bot{}" },
|
||||
{ "(?-m)$", "eot{}" },
|
||||
{ "(?m)\\A", "bot{}" },
|
||||
{ "(?m)\\z", "eot{\\z}" },
|
||||
{ "(?-m)\\A", "bot{}" },
|
||||
{ "(?-m)\\z", "eot{\\z}" },
|
||||
|
||||
// Test named captures
|
||||
{ "(?P<name>a)", "cap{name:lit{a}}" },
|
||||
|
||||
// Case-folded literals
|
||||
{ "[Aa]", "litfold{a}" },
|
||||
|
||||
// Strings
|
||||
{ "abcde", "str{abcde}" },
|
||||
{ "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
|
||||
};
|
||||
|
||||
// Flag set used when parsing the main `tests` table
// (see TEST(TestParse, SimpleRegexps)).
static Regexp::ParseFlags kTestFlags =
    Regexp::MatchNL | Regexp::PerlX | Regexp::PerlClasses |
    Regexp::UnicodeGroups;
|
||||
|
||||
// Thin wrapper: returns whatever Regexp::Equal reports for (a, b).
bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
  const bool equal = Regexp::Equal(a, b);
  return equal;
}
|
||||
|
||||
void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
|
||||
const string& title) {
|
||||
Regexp** re = new Regexp*[ntests];
|
||||
for (int i = 0; i < ntests; i++) {
|
||||
RegexpStatus status;
|
||||
re[i] = Regexp::Parse(tests[i].regexp, flags, &status);
|
||||
CHECK(re[i] != NULL) << " " << tests[i].regexp << " "
|
||||
<< status.Text();
|
||||
string s = re[i]->Dump();
|
||||
EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp
|
||||
<< "\nparse: " << tests[i].parse << " s: " << s;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ntests; i++) {
|
||||
for (int j = 0; j < ntests; j++) {
|
||||
EXPECT_EQ(string(tests[i].parse) == tests[j].parse,
|
||||
RegexpEqualTestingOnly(re[i], re[j]))
|
||||
<< "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ntests; i++)
|
||||
re[i]->Decref();
|
||||
delete[] re;
|
||||
}
|
||||
|
||||
// Test that regexps parse to expected structures.
|
||||
// Runs the main `tests` table through TestParse under kTestFlags.
TEST(TestParse, SimpleRegexps) {
  const int ntests = arraysize(tests);
  TestParse(tests, ntests, kTestFlags, "simple");
}
|
||||
|
||||
// Regexp / expected-dump pairs parsed with Regexp::FoldCase
// (see TEST(TestParse, FoldCase)).
Test foldcase_tests[] = {
  { "AbCdE", "strfold{abcde}" },
  { "[Aa]", "litfold{a}" },
  { "a", "litfold{a}" },

  // 0x17F is an old English long s (looks like an f) and folds to s.
  // 0x212A is the Kelvin symbol and folds to k.
  { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" },  // [Aa][A-z...]
  { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
  { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
};
|
||||
|
||||
// Test that parsing with FoldCase works.
|
||||
// Runs foldcase_tests through TestParse with Regexp::FoldCase.
TEST(TestParse, FoldCase) {
  const int ntests = arraysize(foldcase_tests);
  TestParse(foldcase_tests, ntests, Regexp::FoldCase, "foldcase");
}
|
||||
|
||||
// Parsed with Regexp::Literal (see TEST(TestParse, Literal)): the expected
// dump is a single str{} holding the input text unchanged.
Test literal_tests[] = {
  { "(|)^$.[*+?]{5,10},\\",
    "str{(|)^$.[*+?]{5,10},\\}" },
};
|
||||
|
||||
// Test that parsing with Literal works.
|
||||
// Runs literal_tests through TestParse with Regexp::Literal.
TEST(TestParse, Literal) {
  const int ntests = arraysize(literal_tests);
  TestParse(literal_tests, ntests, Regexp::Literal, "literal");
}
|
||||
|
||||
// Expected dumps when parsing with Regexp::MatchNL
// (see TEST(TestParse, MatchNL)).
Test matchnl_tests[] = {
  { ".", "dot{}" },
  { "\n", "lit{\n}" },
  { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
  { "[a\\n]", "cc{0xa 0x61}" },
};
|
||||
|
||||
// Test that parsing with MatchNL works.
|
||||
// (Also tested above during simple cases.)
|
||||
// Runs matchnl_tests through TestParse with Regexp::MatchNL.
TEST(TestParse, MatchNL) {
  const int ntests = arraysize(matchnl_tests);
  TestParse(matchnl_tests, ntests, Regexp::MatchNL, "with MatchNL");
}
|
||||
|
||||
// Expected dumps when parsing with Regexp::NoParseFlags
// (see TEST(TestParse, NoMatchNL)): "." and "[^a]" exclude 0xa here.
Test nomatchnl_tests[] = {
  { ".", "cc{0-0x9 0xb-0x10ffff}" },
  { "\n", "lit{\n}" },
  { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
  { "[a\\n]", "cc{0xa 0x61}" },
};
|
||||
|
||||
// Test that parsing without MatchNL works.
|
||||
// Runs nomatchnl_tests through TestParse with Regexp::NoParseFlags.
TEST(TestParse, NoMatchNL) {
  const int ntests = arraysize(nomatchnl_tests);
  TestParse(nomatchnl_tests, ntests, Regexp::NoParseFlags, "without MatchNL");
}
|
||||
|
||||
// Alternations and the dumps expected for them when parsed with
// Regexp::PerlX (see TEST(TestParse, Prefix)).
Test prefix_tests[] = {
  { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
  { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
  { "abc|abd|aef|bcx|bcy",
    "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
      "cat{str{bc}cc{0x78-0x79}}}" },
  { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
  { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
  { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
  { "(?:xx|yy)c|(?:xx|yy)d",
    "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" },
  { "x{2}|x{2}[0-9]",
    "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
  { "x{2}y|x{2}[0-9]y",
    "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
};
|
||||
|
||||
// Test that prefix factoring works.
|
||||
// Runs prefix_tests through TestParse with Regexp::PerlX.
TEST(TestParse, Prefix) {
  const int ntests = arraysize(prefix_tests);
  TestParse(prefix_tests, ntests, Regexp::PerlX, "prefix");
}
|
||||
|
||||
// Invalid regular expressions
|
||||
// Patterns that TEST(TestParse, InvalidRegexps) expects Regexp::Parse to
// reject under both Regexp::PerlX and Regexp::NoParseFlags.
const char* badtests[] = {
  // Parentheses and brackets.
  "(",
  ")",
  "(a",
  "(a|b|",
  "(a|b",
  "[a-z",
  "([a-z)",

  // Repetition count.
  "x{1001}",

  // Invalid UTF-8
  "\xff",
  "[\xff]",
  "[\\\xff]",
  "\\\xff",

  // Named captures.
  "(?P<name>a",
  "(?P<name>",
  "(?P<name",
  "(?P<x y>a)",
  "(?P<>a)",

  // Character class ranges.
  "[a-Z]",
  "(?i)[a-Z]",

  // Repetition counts.
  "a{100000}",
  "a{100000,}",
};
|
||||
|
||||
// Valid in Perl, bad in POSIX
|
||||
// Patterns that TEST(TestParse, InvalidRegexps) expects to parse with
// Regexp::PerlX and to be rejected with Regexp::NoParseFlags.
const char* only_perl[] = {
  "[a-b-c]",

  // \Q...\E quoting.
  "\\Qabc\\E",
  "\\Q*+?{[\\E",
  "\\Q\\\\E",
  "\\Q\\\\\\E",
  "\\Q\\\\\\\\E",
  "\\Q\\\\\\\\\\E",

  // Group syntax.
  "(?:a)",
  "(?P<name>a)",
};
|
||||
|
||||
// Valid in POSIX, bad in Perl.
|
||||
// Patterns that TEST(TestParse, InvalidRegexps) expects to parse with
// Regexp::NoParseFlags and to be rejected with Regexp::PerlX.
const char* only_posix[] = {
  "a++", "a**", "a?*", "a+*",
  "a{1}*",
};
|
||||
|
||||
// Test that parser rejects bad regexps.
|
||||
// Checks three tables: badtests must fail under both flag sets,
// only_posix must fail under PerlX but parse under NoParseFlags, and
// only_perl must fail under NoParseFlags but parse under PerlX.
TEST(TestParse, InvalidRegexps) {
  for (int k = 0; k < arraysize(badtests); k++) {
    const char* pattern = badtests[k];
    CHECK(Regexp::Parse(pattern, Regexp::PerlX, NULL) == NULL)
      << " " << pattern;
    CHECK(Regexp::Parse(pattern, Regexp::NoParseFlags, NULL) == NULL)
      << " " << pattern;
  }

  for (int k = 0; k < arraysize(only_posix); k++) {
    const char* pattern = only_posix[k];
    CHECK(Regexp::Parse(pattern, Regexp::PerlX, NULL) == NULL)
      << " " << pattern;
    Regexp* re = Regexp::Parse(pattern, Regexp::NoParseFlags, NULL);
    CHECK(re) << " " << pattern;
    re->Decref();  // drop the reference returned by Parse
  }

  for (int k = 0; k < arraysize(only_perl); k++) {
    const char* pattern = only_perl[k];
    CHECK(Regexp::Parse(pattern, Regexp::NoParseFlags, NULL) == NULL)
      << " " << pattern;
    Regexp* re = Regexp::Parse(pattern, Regexp::PerlX, NULL);
    CHECK(re) << " " << pattern;
    re->Decref();  // drop the reference returned by Parse
  }
}
|
||||
|
||||
// Test that ToString produces original regexp or equivalent one.
|
||||
// For every entry of the main table: parse it, confirm the dump, then call
// ToString().  If the string differs from the input, reparse it and require
// the same dump and the same ToString() output.
TEST(TestToString, EquivalentParse) {
  for (int k = 0; k < arraysize(tests); k++) {
    const char* original = tests[k].regexp;

    RegexpStatus status;
    Regexp* re = Regexp::Parse(original, kTestFlags, &status);
    CHECK(re != NULL) << " " << original << " " << status.Text();
    string s = re->Dump();
    EXPECT_EQ(string(tests[k].parse), s);
    string t = re->ToString();

    if (t == original) {
      // Round-tripped exactly; nothing more to verify for this entry.
      re->Decref();
      continue;
    }

    // If ToString didn't return the original regexp,
    // it must have found one with fewer parens.
    // Unfortunately we can't check the length here, because
    // ToString produces "\\{" for a literal brace,
    // but "{" is a shorter equivalent.
    // CHECK_LT(t.size(), strlen(tests[i].regexp))
    //     << " t=" << t << " regexp=" << tests[i].regexp;

    // Test that if we parse the new regexp we get the same structure.
    Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
    CHECK(nre != NULL) << " reparse " << t << " " << status.Text();
    string ss = nre->Dump();
    string tt = nre->ToString();
    const bool stable = (s == ss) && (t == tt);
    if (!stable)
      LOG(INFO) << "ToString(" << original << ") = " << t;
    EXPECT_EQ(s, ss);
    EXPECT_EQ(t, tt);
    nre->Decref();

    re->Decref();
  }
}
|
||||
|
||||
// Test that capture error args are correct.
|
||||
// Two malformed named groups: each parse must return NULL with code
// kRegexpBadNamedCapture and the expected error_arg() text.
TEST(NamedCaptures, ErrorArgs) {
  RegexpStatus status;

  // Unterminated group name.
  {
    Regexp* re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
    EXPECT_TRUE(re == NULL);
    EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
    EXPECT_EQ(status.error_arg(), "(?P<name");
  }

  // Group name containing a space.
  {
    Regexp* re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl,
                               &status);
    EXPECT_TRUE(re == NULL);
    EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
    EXPECT_EQ(status.error_arg(), "(?P<space bar>");
  }
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,240 @@
|
|||
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test that C++ strings are compared as uint8s, not int8s.
|
||||
// PossibleMatchRange doesn't depend on this, but callers probably will.
|
||||
// "\x70" must order before "\xA0" when compared as strings.
TEST(CplusplusStrings, EightBit) {
  string s("\x70");
  string t("\xA0");
  EXPECT_LT(s, t);
}
|
||||
|
||||
// One hand-written PossibleMatchRange case.
struct PrefixTest {
  const char* regexp;  // pattern to compile
  int maxlen;          // maxlen argument passed to PossibleMatchRange
  const char* min;     // expected value written to *min
  const char* max;     // expected value written to *max
};
|
||||
|
||||
// Cases for TEST(PossibleMatchRange, HandWritten), in four groups:
// plain patterns, the same with (?i), the same with \A, and with both.
static PrefixTest tests[] = {
  // Plain.
  { "",                      10, "",       "", },
  { "Abcdef",                10, "Abcdef", "Abcdef" },
  { "abc(def|ghi)",          10, "abcdef", "abcghi" },
  { "a+hello",               10, "aa",     "ahello" },
  { "a*hello",               10, "a",      "hello" },
  { "def|abc",               10, "abc",    "def" },
  { "a(b)(c)[d]",            10, "abcd",   "abcd" },
  { "ab(cab|cat)",           10, "abcab",  "abcat" },
  { "ab(cab|ca)x",           10, "abcabx", "abcax" },
  { "(ab|x)(c|de)",          10, "abc",    "xde" },
  { "(ab|x)?(c|z)?",         10, "",       "z" },
  { "[^\\s\\S]",             10, "",       "" },
  { "(abc)+",                 5, "abc",    "abcac" },
  { "(abc)+",                 2, "ab",     "ac" },
  { "(abc)+",                 1, "a",      "b" },
  { "[a\xC3\xA1]",            4, "a",      "\xC3\xA1" },
  { "a*",                    10, "",       "ab" },

  // Case-insensitive.
  { "(?i)Abcdef",            10, "ABCDEF", "abcdef" },
  { "(?i)abc(def|ghi)",      10, "ABCDEF", "abcghi" },
  { "(?i)a+hello",           10, "AA",     "ahello" },
  { "(?i)a*hello",           10, "A",      "hello" },
  { "(?i)def|abc",           10, "ABC",    "def" },
  { "(?i)a(b)(c)[d]",        10, "ABCD",   "abcd" },
  { "(?i)ab(cab|cat)",       10, "ABCAB",  "abcat" },
  { "(?i)ab(cab|ca)x",       10, "ABCABX", "abcax" },
  { "(?i)(ab|x)(c|de)",      10, "ABC",    "xde" },
  { "(?i)(ab|x)?(c|z)?",     10, "",       "z" },
  { "(?i)[^\\s\\S]",         10, "",       "" },
  { "(?i)(abc)+",             5, "ABC",    "abcac" },
  { "(?i)(abc)+",             2, "AB",     "ac" },
  { "(?i)(abc)+",             1, "A",      "b" },
  { "(?i)[a\xC3\xA1]",        4, "A",      "\xC3\xA1" },
  { "(?i)a*",                10, "",       "ab" },
  { "(?i)A*",                10, "",       "ab" },

  // Anchored with \A.
  { "\\AAbcdef",             10, "Abcdef", "Abcdef" },
  { "\\Aabc(def|ghi)",       10, "abcdef", "abcghi" },
  { "\\Aa+hello",            10, "aa",     "ahello" },
  { "\\Aa*hello",            10, "a",      "hello" },
  { "\\Adef|abc",            10, "abc",    "def" },
  { "\\Aa(b)(c)[d]",         10, "abcd",   "abcd" },
  { "\\Aab(cab|cat)",        10, "abcab",  "abcat" },
  { "\\Aab(cab|ca)x",        10, "abcabx", "abcax" },
  { "\\A(ab|x)(c|de)",       10, "abc",    "xde" },
  { "\\A(ab|x)?(c|z)?",      10, "",       "z" },
  { "\\A[^\\s\\S]",          10, "",       "" },
  { "\\A(abc)+",              5, "abc",    "abcac" },
  { "\\A(abc)+",              2, "ab",     "ac" },
  { "\\A(abc)+",              1, "a",      "b" },
  { "\\A[a\xC3\xA1]",         4, "a",      "\xC3\xA1" },
  { "\\Aa*",                 10, "",       "ab" },

  // Case-insensitive and anchored with \A.
  { "(?i)\\AAbcdef",         10, "ABCDEF", "abcdef" },
  { "(?i)\\Aabc(def|ghi)",   10, "ABCDEF", "abcghi" },
  { "(?i)\\Aa+hello",        10, "AA",     "ahello" },
  { "(?i)\\Aa*hello",        10, "A",      "hello" },
  { "(?i)\\Adef|abc",        10, "ABC",    "def" },
  { "(?i)\\Aa(b)(c)[d]",     10, "ABCD",   "abcd" },
  { "(?i)\\Aab(cab|cat)",    10, "ABCAB",  "abcat" },
  { "(?i)\\Aab(cab|ca)x",    10, "ABCABX", "abcax" },
  { "(?i)\\A(ab|x)(c|de)",   10, "ABC",    "xde" },
  { "(?i)\\A(ab|x)?(c|z)?",  10, "",       "z" },
  { "(?i)\\A[^\\s\\S]",      10, "",       "" },
  { "(?i)\\A(abc)+",          5, "ABC",    "abcac" },
  { "(?i)\\A(abc)+",          2, "AB",     "ac" },
  { "(?i)\\A(abc)+",          1, "A",      "b" },
  { "(?i)\\A[a\xC3\xA1]",     4, "A",      "\xC3\xA1" },
  { "(?i)\\Aa*",             10, "",       "ab" },
  { "(?i)\\AA*",             10, "",       "ab" },
};
|
||||
|
||||
// Checks every table entry twice: first through Regexp::Parse +
// CompileToProg + Prog::PossibleMatchRange, then through
// RE2::PossibleMatchRange.  Both must produce the expected min and max.
TEST(PossibleMatchRange, HandWritten) {
  for (int i = 0; i < arraysize(tests); i++) {
    const PrefixTest& t = tests[i];
    for (int pass = 0; pass < 2; pass++) {
      string min, max;
      const bool via_re2 = (pass != 0);
      if (via_re2) {
        CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
      } else {
        LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
        Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
        CHECK(re);
        Prog* prog = re->CompileToProg(0);
        CHECK(prog);
        CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
          << " " << t.regexp;
        delete prog;
        re->Decref();
      }
      EXPECT_EQ(t.min, min) << t.regexp;
      EXPECT_EQ(t.max, max) << t.regexp;
    }
  }
}
|
||||
|
||||
// Test cases where PossibleMatchRange should return false.
|
||||
// Inputs for which PossibleMatchRange is expected to return false.
TEST(PossibleMatchRange, Failures) {
  string min, max;

  // Fails because no room to write max.
  EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));

  // Fails because there is no max -- any non-empty string matches
  // or begins a match.  Have to use Latin-1 input, because there
  // are no valid UTF-8 strings beginning with byte 0xFF.
  static const char* const kNoMaxLatin1[] = {
    "[\\s\\S]+",
    "[\\0-\xFF]+",
    ".+hello",
    ".*hello",
    ".*",
  };
  for (int k = 0; k < arraysize(kNoMaxLatin1); k++) {
    EXPECT_FALSE(RE2(kNoMaxLatin1[k], RE2::Latin1).
                 PossibleMatchRange(&min, &max, 10))
      << "min=" << CEscape(min) << ", max=" << CEscape(max);
  }

  // Same expectation for \C*, this time with default options.
  EXPECT_FALSE(RE2("\\C*").
               PossibleMatchRange(&min, &max, 10))
    << "min=" << CEscape(min) << ", max=" << CEscape(max);

  // Fails because it's a malformed regexp.
  EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
    << "min=" << CEscape(min) << ", max=" << CEscape(max);
}
|
||||
|
||||
// Exhaustive test: generate all regexps within parameters,
|
||||
// then generate all strings of a given length over a given alphabet,
|
||||
// then check that the prefix information agrees with whether
|
||||
// the regexp matches each of the strings.
|
||||
class PossibleMatchTester : public RegexpGenerator {
|
||||
public:
|
||||
PossibleMatchTester(int maxatoms,
|
||||
int maxops,
|
||||
const vector<string>& alphabet,
|
||||
const vector<string>& ops,
|
||||
int maxstrlen,
|
||||
const vector<string>& stralphabet)
|
||||
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
|
||||
strgen_(maxstrlen, stralphabet),
|
||||
regexps_(0), tests_(0) { }
|
||||
|
||||
int regexps() { return regexps_; }
|
||||
int tests() { return tests_; }
|
||||
|
||||
// Needed for RegexpGenerator interface.
|
||||
void HandleRegexp(const string& regexp);
|
||||
|
||||
private:
|
||||
StringGenerator strgen_;
|
||||
|
||||
int regexps_; // Number of HandleRegexp calls
|
||||
int tests_; // Number of regexp tests.
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
|
||||
};
|
||||
|
||||
// Processes a single generated regexp.
|
||||
// Checks that all accepted strings agree with the prefix range.
|
||||
void PossibleMatchTester::HandleRegexp(const string& regexp) {
|
||||
regexps_++;
|
||||
|
||||
VLOG(3) << CEscape(regexp);
|
||||
|
||||
RE2 re(regexp, RE2::Latin1);
|
||||
CHECK_EQ(re.error(), "");
|
||||
|
||||
string min, max;
|
||||
if(!re.PossibleMatchRange(&min, &max, 10)) {
|
||||
// There's no good max for "\\C*". Can't use strcmp
|
||||
// because sometimes it gets embedded in more
|
||||
// complicated expressions.
|
||||
if(strstr(regexp.c_str(), "\\C*"))
|
||||
return;
|
||||
LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
|
||||
}
|
||||
|
||||
strgen_.Reset();
|
||||
while (strgen_.HasNext()) {
|
||||
const StringPiece& s = strgen_.Next();
|
||||
tests_++;
|
||||
if (!RE2::FullMatch(s, re))
|
||||
continue;
|
||||
CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
|
||||
CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
|
||||
}
|
||||
}
|
||||
|
||||
// Generates regexps over atoms "a", "b", "[0-9]" with the egrep operators
// and tests them against strings over "ab4".  Limits are smaller when
// DEBUG_MODE is set.
TEST(PossibleMatchRange, Exhaustive) {
  const int natom = DEBUG_MODE ? 2 : 3;
  const int noperator = 3;
  const int stringlen = DEBUG_MODE ? 3 : 5;

  PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
                        RegexpGenerator::EgrepOps(),
                        stringlen, Explode("ab4"));
  t.Generate();
  LOG(INFO) << t.regexps() << " regexps, "
            << t.tests() << " tests";
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,95 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Random testing of regular expression matching.
|
||||
|
||||
#include <stdio.h>
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
// Flags read by RandomTest: seeds and counts for the random regexps and
// random strings.  The second argument of each DEFINE is its default.
DEFINE_int32(regexpseed, 404, "Random regexp seed.");
|
||||
DEFINE_int32(regexpcount, 100, "How many random regexps to generate.");
|
||||
DEFINE_int32(stringseed, 200, "Random string seed.");
|
||||
DEFINE_int32(stringcount, 100, "How many random strings to generate.");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Runs a random test on the given parameters.
|
||||
// (Always uses the same random seeds for reproducibility.
|
||||
// Can give different seeds on command line.)
|
||||
static void RandomTest(int maxatoms, int maxops,
|
||||
const vector<string>& alphabet,
|
||||
const vector<string>& ops,
|
||||
int maxstrlen, const vector<string>& stralphabet,
|
||||
const string& wrapper) {
|
||||
// Limit to smaller test cases in debug mode,
|
||||
// because everything is so much slower.
|
||||
if (DEBUG_MODE) {
|
||||
maxatoms--;
|
||||
maxops--;
|
||||
maxstrlen /= 2;
|
||||
}
|
||||
|
||||
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
|
||||
maxstrlen, stralphabet, wrapper, "");
|
||||
t.RandomStrings(FLAGS_stringseed, FLAGS_stringcount);
|
||||
t.GenerateRandom(FLAGS_regexpseed, FLAGS_regexpcount);
|
||||
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
|
||||
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
|
||||
EXPECT_EQ(0, t.failures());
|
||||
}
|
||||
|
||||
// Tests random small regexps involving literals and egrep operators.
|
||||
// Up to 5 atoms and 5 operators over "abc." with the egrep operators;
// strings of length up to 15 over "abc".
TEST(Random, SmallEgrepLiterals) {
  RandomTest(5, 5,
             Explode("abc."), RegexpGenerator::EgrepOps(),
             15, Explode("abc"), "");
}
|
||||
|
||||
// Tests random bigger regexps involving literals and egrep operators.
|
||||
// Up to 10 atoms and 10 operators over "abc." with the egrep operators;
// strings of length up to 15 over "abc".
TEST(Random, BigEgrepLiterals) {
  RandomTest(10, 10,
             Explode("abc."), RegexpGenerator::EgrepOps(),
             15, Explode("abc"), "");
}
|
||||
|
||||
// Tests random small regexps involving literals, capturing parens,
|
||||
// and egrep operators.
|
||||
// Up to 5 atoms and 5 operators over the atoms "a", "(b)" and "." with
// the egrep operators; strings of length up to 15 over "abc".
TEST(Random, SmallEgrepCaptures) {
  RandomTest(5, 5,
             Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
             15, Explode("abc"), "");
}
|
||||
|
||||
// Tests random bigger regexps involving literals, capturing parens,
|
||||
// and egrep operators.
|
||||
// Up to 10 atoms and 10 operators over the atoms "a", "(b)" and "." with
// the egrep operators; strings of length up to 15 over "abc".
TEST(Random, BigEgrepCaptures) {
  RandomTest(10, 10,
             Split(" ", "a (b) ."), RegexpGenerator::EgrepOps(),
             15, Explode("abc"), "");
}
|
||||
|
||||
// Tests random large complicated expressions, using all the possible
|
||||
// operators, some literals, some parenthesized literals, and predefined
|
||||
// character classes like \d. (Adding larger character classes would
|
||||
// make for too many possibilities.)
|
||||
// Random test with a wide operator set (greedy and non-greedy repeats,
// counted repetition), many escape and class atoms, and a string alphabet
// that includes control characters.
TEST(Random, Complicated) {
  vector<string> operators = Split(" ",
    "%s%s %s|%s %s* %s*? %s+ %s+? %s? %s?? "
    "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} %s{1,2} "
    "%s{2} %s{2,} %s{3,4} %s{4,5}");

  // Use (?:\b) and (?:\B) instead of \b and \B,
  // because PCRE rejects \b* but accepts (?:\b)*.
  // Ditto ^ and $.
  vector<string> atom_list = Split(" ",
    ". (?:^) (?:$) \\a \\f \\n \\r \\t \\v "
    "\\d \\D \\s \\S \\w \\W (?:\\b) (?:\\B) "
    "a (a) b c - \\\\");

  vector<string> string_alphabet = Explode("abc123\001\002\003\t\r\n\v\f\a");

  RandomTest(10, 10, atom_list, operators, 20, string_alphabet, "");
}
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
// Copyright 2005 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// This tests to make sure numbers are parsed from strings
|
||||
// correctly.
|
||||
// Todo: Expand the test to validate strings parsed to the other types
|
||||
// supported by RE2::Arg class
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct SuccessTable {
|
||||
const char * value_string;
|
||||
int64 value;
|
||||
bool success[6];
|
||||
};
|
||||
|
||||
// Test boundary cases for different integral sizes.
|
||||
// Specifically I want to make sure that values outside the boundaries
|
||||
// of an integral type will fail and that negative numbers will fail
|
||||
// for unsigned types. The following table contains the boundaries for
|
||||
// the various integral types and has entries for whether or not each
|
||||
// type can contain the given value.
|
||||
// Boundary values for each integral width.  Each row gives the text to
// parse, the value expected when parsing succeeds, and one success flag
// per target type in the column order listed below.
//
// The int64 `value` field is brace-initialized, so every initializer must
// be representable as int64 without a narrowing conversion.  Negative
// limits are therefore written as signed expressions instead of hex
// literals (which have an unsigned type when they do not fit in a signed
// one), and the two values that only fit in uint64 are converted with an
// explicit cast.
const SuccessTable kSuccessTable[] = {
// string       integer value     short  ushort int    uint   int64  uint64
// 0 to 2^7-1
{ "0",          0,                { true,  true,  true,  true,  true,  true  }},
{ "127",        127,              { true,  true,  true,  true,  true,  true  }},

// -1 to -2^7
{ "-1",         -1,               { true,  false, true,  false, true,  false }},
{ "-128",       -128,             { true,  false, true,  false, true,  false }},

// 2^7 to 2^8-1
{ "128",        128,              { true,  true,  true,  true,  true,  true  }},
{ "255",        255,              { true,  true,  true,  true,  true,  true  }},

// 2^8 to 2^15-1
{ "256",        256,              { true,  true,  true,  true,  true,  true  }},
{ "32767",      32767,            { true,  true,  true,  true,  true,  true  }},

// -2^7-1 to -2^15
{ "-129",       -129,             { true,  false, true,  false, true,  false }},
{ "-32768",     -32768,           { true,  false, true,  false, true,  false }},

// 2^15 to 2^16-1
{ "32768",      32768,            { false, true,  true,  true,  true,  true  }},
{ "65535",      65535,            { false, true,  true,  true,  true,  true  }},

// 2^16 to 2^31-1
{ "65536",      65536,            { false, false, true,  true,  true,  true  }},
{ "2147483647", 2147483647,       { false, false, true,  true,  true,  true  }},

// -2^15-1 to -2^31
{ "-32769",     -32769,           { false, false, true,  false, true,  false }},
{ "-2147483648",
  -2147483648LL,                  { false, false, true,  false, true,  false }},

// 2^31 to 2^32-1
{ "2147483648", 2147483648U,      { false, false, false, true,  true,  true  }},
{ "4294967295", 4294967295U,      { false, false, false, true,  true,  true  }},

// 2^32 to 2^63-1
{ "4294967296", 4294967296LL,     { false, false, false, false, true,  true  }},
{ "9223372036854775807",
  9223372036854775807LL,          { false, false, false, false, true,  true  }},

// -2^31-1 to -2^63
{ "-2147483649", -2147483649LL,   { false, false, false, false, true,  false }},
{ "-9223372036854775808",
  -9223372036854775807LL - 1,     { false, false, false, false, true,  false }},

// 2^63 to 2^64-1
// These only fit in uint64; the cast stores the same bit pattern in the
// int64 field, and the uint64 test compares after converting back.
{ "9223372036854775808",
  static_cast<int64>(9223372036854775808ULL),
                                  { false, false, false, false, false, true  }},
{ "18446744073709551615",
  static_cast<int64>(18446744073709551615ULL),
                                  { false, false, false, false, false, true  }},

// >= 2^64
{ "18446744073709551616", 0,      { false, false, false, false, false, false }},
};
|
||||
|
||||
// Number of rows in kSuccessTable; loop bound used by PARSE_FOR_TYPE.
const int kNumStrings = ARRAYSIZE(kSuccessTable);
|
||||
|
||||
// It's ugly to use a macro, but we apparently can't use the ASSERT_TRUE_M
|
||||
// macro outside of a TEST block and this seems to be the only way to
|
||||
// avoid code duplication. I can also pull off a couple nice tricks
|
||||
// using concatenation for the type I'm checking against.
|
||||
// PARSE_FOR_TYPE(type, column): for every row of kSuccessTable, parse the
// row's string into a variable of `type` through RE2::Arg and assert that
// the result of Parse() equals success[column].  When success is expected,
// also assert that the parsed value equals the row's `value`.
#define PARSE_FOR_TYPE(type, column) {                                       \
  type r;                                                                    \
  for (int row = 0; row < kNumStrings; ++row) {                              \
    const char* const p = kSuccessTable[row].value_string;                   \
    bool success = kSuccessTable[row].success[column];                       \
    RE2::Arg arg(&r);                                                        \
    bool retval = arg.Parse(p, strlen(p));                                   \
    ASSERT_TRUE_M(retval == success,                                         \
      StringPrintf("Parsing '%s' for type " #type " should return %d",       \
                   p, success).c_str());                                     \
    if (success) {                                                           \
      ASSERT_EQUALS(r, kSuccessTable[row].value);                            \
    }                                                                        \
  }                                                                          \
}
|
||||
|
||||
// int16 uses column 0 ("short") of the success table.
TEST(REArgTest, Int16Test) {
  PARSE_FOR_TYPE(int16, 0);
}
|
||||
|
||||
// uint16 uses column 1 ("ushort") of the success table.
TEST(REArgTest, Uint16Test) {
  PARSE_FOR_TYPE(uint16, 1);
}
|
||||
|
||||
// int uses column 2 ("int") of the success table.
TEST(REArgTest, IntTest) {
  PARSE_FOR_TYPE(int, 2);
}
|
||||
|
||||
// uint32 uses column 3 ("uint") of the success table.
TEST(REArgTest, UInt32Test) {
  PARSE_FOR_TYPE(uint32, 3);
}
|
||||
|
||||
// int64 uses column 4 ("int64") of the success table.
TEST(REArgTest, Iint64Test) {
  PARSE_FOR_TYPE(int64, 4);
}
|
||||
|
||||
// uint64 uses column 5 ("uint64") of the success table.
TEST(REArgTest, Uint64Test) {
  PARSE_FOR_TYPE(uint64, 5);
}
|
||||
|
||||
} // namespace re2
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,264 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Regular expression generator: generates all possible
|
||||
// regular expressions within parameters (see regexp_generator.h for details).
|
||||
|
||||
// The regexp generator first generates a sequence of commands in a simple
|
||||
// postfix language. Each command in the language is a string,
|
||||
// like "a" or "%s*" or "%s|%s".
|
||||
//
|
||||
// To evaluate a command, enough arguments are popped from the value stack to
|
||||
// plug into the %s slots. Then the result is pushed onto the stack.
|
||||
// For example, the command sequence
|
||||
// a b %s%s c
|
||||
// results in the stack
|
||||
// ab c
|
||||
//
|
||||
// GeneratePostfix generates all possible command sequences.
|
||||
// Then RunPostfix turns each sequence into a regular expression
|
||||
// and passes the regexp to HandleRegexp.
|
||||
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include <stack>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Returns a vector of the egrep regexp operators.
|
||||
// Returns a reference to a function-local static vector holding the six
// egrep operator format strings.
const vector<string>& RegexpGenerator::EgrepOps() {
  static const char *kEgrepOps[] = {
    "%s%s", "%s|%s",
    "%s*", "%s+", "%s?",
    "%s\\C*",
  };
  static vector<string> egrep_ops(kEgrepOps,
                                  kEgrepOps + arraysize(kEgrepOps));
  return egrep_ops;
}
|
||||
|
||||
// Stores the limits and copies the atom and operator lists.  An empty list
// forces the corresponding limit to zero.
RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
                                 const vector<string>& atoms,
                                 const vector<string>& ops)
    : maxatoms_(maxatoms),
      maxops_(maxops),
      atoms_(atoms),
      ops_(ops) {
  // Degenerate case.
  if (atoms_.empty())
    maxatoms_ = 0;
  if (ops_.empty())
    maxops_ = 0;
}
|
||||
|
||||
// Generates all possible regular expressions (within the parameters),
|
||||
// calling HandleRegexp for each one.
|
||||
void RegexpGenerator::Generate() {
|
||||
vector<string> postfix;
|
||||
GeneratePostfix(&postfix, 0, 0, 0);
|
||||
}
|
||||
|
||||
// Generates random regular expressions, calling HandleRegexp for each one.
|
||||
// Makes n top-level calls to GenerateRandomPostfix, drawing randomness
// from an ACMRandom seeded with seed.  acm_ points at that generator only
// for the duration of the call.
void RegexpGenerator::GenerateRandom(int32 seed, int n) {
  ACMRandom rng(seed);
  acm_ = &rng;

  for (int remaining = n; remaining > 0; remaining--) {
    vector<string> sequence;
    GenerateRandomPostfix(&sequence, 0, 0, 0);
  }

  // rng is about to go out of scope; do not leave a dangling pointer.
  acm_ = NULL;
}
|
||||
|
||||
// Counts and returns the number of occurrences of "%s" in s.
|
||||
// Returns how many non-overlapping "%s" markers occur in s, scanning left
// to right over s.c_str().
static int CountArgs(const string& s) {
  int count = 0;
  for (const char* hit = strstr(s.c_str(), "%s");
       hit != NULL;
       hit = strstr(hit + 2, "%s")) {
    count++;
  }
  return count;
}
|
||||
|
||||
// Generates all possible postfix command sequences.
|
||||
// Each sequence is handed off to RunPostfix to generate a regular expression.
|
||||
// The arguments are:
|
||||
// post: the current postfix sequence
|
||||
// nstk: the number of elements that would be on the stack after executing
|
||||
// the sequence
|
||||
// ops: the number of operators used in the sequence
|
||||
// atoms: the number of atoms used in the sequence
|
||||
// For example, if post were ["a", "b", "%s%s", "c"],
|
||||
// then nstk = 2, ops = 1, atoms = 3.
|
||||
//
|
||||
// The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
|
||||
//
|
||||
// Recursively extends *post with every atom and every applicable operator.
// Whenever the sequence would leave exactly one value on the stack it is
// handed to RunPostfix.  *post is restored before returning.
void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
                                      int ops, int atoms) {
  if (nstk == 1)
    RunPostfix(*post);

  // Early out: if used too many operators or can't
  // get back down to a single expression on the stack
  // using binary operators, give up.
  if (ops + nstk - 1 > maxops_)
    return;

  // Add atoms if there is room.
  if (atoms < maxatoms_) {
    for (vector<string>::const_iterator atom = atoms_.begin();
         atom != atoms_.end(); ++atom) {
      post->push_back(*atom);
      GeneratePostfix(post, nstk + 1, ops, atoms + 1);
      post->pop_back();
    }
  }

  // Add operators if there are enough arguments.
  if (ops >= maxops_)
    return;
  for (vector<string>::const_iterator op = ops_.begin();
       op != ops_.end(); ++op) {
    const string& fmt = *op;
    const int nargs = CountArgs(fmt);
    if (nargs > nstk)
      continue;  // not enough values on the stack for this operator
    post->push_back(fmt);
    GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
    post->pop_back();
  }
}
|
||||
|
||||
// Generates a random postfix command sequence.
|
||||
// Stops and returns true once a single sequence has been generated.
|
||||
// Randomly extends *post until one sequence has been passed to RunPostfix.
// Returns true once that has happened and false when this branch hits the
// operator limit.  All randomness is drawn from acm_.
bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
                                            int ops, int atoms) {
  while (true) {
    // Stop if we get to a single element, but only sometimes.
    if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
      RunPostfix(*post);
      return true;
    }

    // Early out: if used too many operators or can't
    // get back down to a single expression on the stack
    // using binary operators, give up.
    if (ops + nstk - 1 > maxops_)
      return false;

    // Add operators if there are enough arguments.
    if (ops < maxops_ && acm_->Uniform(2) == 0) {
      const string& fmt = ops_[acm_->Uniform(ops_.size())];
      const int nargs = CountArgs(fmt);
      if (nargs <= nstk) {
        post->push_back(fmt);
        const bool done = GenerateRandomPostfix(post, nstk - nargs + 1,
                                                ops + 1, atoms);
        post->pop_back();
        if (done)
          return true;
      }
    }

    // Add atoms if there is room.
    if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
      post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
      const bool done = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
      post->pop_back();
      if (done)
        return true;
    }
  }
}
|
||||
|
||||
// Interprets the postfix command sequence to create a regular expression
|
||||
// passed to HandleRegexp. The results of operators like %s|%s are wrapped
|
||||
// in (?: ) to avoid needing to maintain a precedence table.
|
||||
void RegexpGenerator::RunPostfix(const vector<string>& post) {
|
||||
stack<string> regexps;
|
||||
for (int i = 0; i < post.size(); i++) {
|
||||
switch (CountArgs(post[i])) {
|
||||
default:
|
||||
LOG(FATAL) << "Bad operator: " << post[i];
|
||||
case 0:
|
||||
regexps.push(post[i]);
|
||||
break;
|
||||
case 1: {
|
||||
string a = regexps.top();
|
||||
regexps.pop();
|
||||
regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
string b = regexps.top();
|
||||
regexps.pop();
|
||||
string a = regexps.top();
|
||||
regexps.pop();
|
||||
regexps.push("(?:" +
|
||||
StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
|
||||
")");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (regexps.size() != 1) {
|
||||
// Internal error - should never happen.
|
||||
printf("Bad regexp program:\n");
|
||||
for (int i = 0; i < post.size(); i++) {
|
||||
printf(" %s\n", CEscape(post[i]).c_str());
|
||||
}
|
||||
printf("Stack after running program:\n");
|
||||
while (!regexps.empty()) {
|
||||
printf(" %s\n", CEscape(regexps.top()).c_str());
|
||||
regexps.pop();
|
||||
}
|
||||
LOG(FATAL) << "Bad regexp program.";
|
||||
}
|
||||
|
||||
HandleRegexp(regexps.top());
|
||||
HandleRegexp("^(?:" + regexps.top() + ")$");
|
||||
HandleRegexp("^(?:" + regexps.top() + ")");
|
||||
HandleRegexp("(?:" + regexps.top() + ")$");
|
||||
}
|
||||
|
||||
// Split s into an vector of strings, one for each UTF-8 character.
|
||||
vector<string> Explode(const StringPiece& s) {
|
||||
vector<string> v;
|
||||
|
||||
for (const char *q = s.begin(); q < s.end(); ) {
|
||||
const char* p = q;
|
||||
Rune r;
|
||||
q += chartorune(&r, q);
|
||||
v.push_back(string(p, q - p));
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
// Split string everywhere a substring is found, returning
|
||||
// vector of pieces.
|
||||
vector<string> Split(const StringPiece& sep, const StringPiece& s) {
|
||||
vector<string> v;
|
||||
|
||||
if (sep.size() == 0)
|
||||
return Explode(s);
|
||||
|
||||
const char *p = s.begin();
|
||||
for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
|
||||
if (StringPiece(q, sep.size()) == sep) {
|
||||
v.push_back(string(p, q - p));
|
||||
p = q + sep.size();
|
||||
q = p - 1; // -1 for ++ in loop
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (p < s.end())
|
||||
v.push_back(string(p, s.end() - p));
|
||||
return v;
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,70 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Regular expression generator: generates all possible
|
||||
// regular expressions within given parameters (see below for details).
|
||||
|
||||
#ifndef RE2_TESTING_REGEXP_GENERATOR_H__
|
||||
#define RE2_TESTING_REGEXP_GENERATOR_H__
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/random.h"
|
||||
#include "util/util.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Regular expression generator.
|
||||
//
|
||||
// Given a set of atom expressions like "a", "b", or "."
|
||||
// and operators like "%s*", generates all possible regular expressions
|
||||
// using at most maxbases base expressions and maxops operators.
|
||||
// For each such expression re, calls HandleRegexp(re).
|
||||
//
|
||||
// Callers are expected to subclass RegexpGenerator and provide HandleRegexp.
|
||||
//
|
||||
class RegexpGenerator {
|
||||
public:
|
||||
RegexpGenerator(int maxatoms, int maxops, const vector<string>& atoms,
|
||||
const vector<string>& ops);
|
||||
virtual ~RegexpGenerator() {}
|
||||
|
||||
// Generates all the regular expressions, calling HandleRegexp(re) for each.
|
||||
void Generate();
|
||||
|
||||
// Generates n random regular expressions, calling HandleRegexp(re) for each.
|
||||
void GenerateRandom(int32 seed, int n);
|
||||
|
||||
// Handles a regular expression. Must be provided by subclass.
|
||||
virtual void HandleRegexp(const string& regexp) = 0;
|
||||
|
||||
// The egrep regexp operators: * + ? | and concatenation.
|
||||
static const vector<string>& EgrepOps();
|
||||
|
||||
private:
|
||||
void RunPostfix(const vector<string>& post);
|
||||
void GeneratePostfix(vector<string>* post, int nstk, int ops, int lits);
|
||||
bool GenerateRandomPostfix(vector<string>* post, int nstk, int ops, int lits);
|
||||
|
||||
int maxatoms_; // Maximum number of atoms allowed in expr.
|
||||
int maxops_; // Maximum number of ops allowed in expr.
|
||||
vector<string> atoms_; // Possible atoms.
|
||||
vector<string> ops_; // Possible ops.
|
||||
ACMRandom* acm_; // Random generator.
|
||||
DISALLOW_EVIL_CONSTRUCTORS(RegexpGenerator);
|
||||
};
|
||||
|
||||
// Helpers for preparing arguments to RegexpGenerator constructor.

// Returns one string for each UTF-8 character in s, in order.
vector<string> Explode(const StringPiece& s);

// Splits string everywhere sep is found, returning
// vector of pieces.  If sep is empty, the result is Explode(s).
// An empty piece after the last separator is not returned.
vector<string> Split(const StringPiece& sep, const StringPiece& s);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_TESTING_REGEXP_GENERATOR_H__
|
|
@ -0,0 +1,81 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test parse.cc, dump.cc, and tostring.cc.
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test that overflowed ref counts work.
|
||||
TEST(Regexp, BigRef) {
|
||||
Regexp* re;
|
||||
re = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
|
||||
for (int i = 0; i < 100000; i++)
|
||||
re->Incref();
|
||||
for (int i = 0; i < 100000; i++)
|
||||
re->Decref();
|
||||
CHECK_EQ(re->Ref(), 1);
|
||||
re->Decref();
|
||||
}
|
||||
|
||||
// Test that very large Concats work.
|
||||
// Depends on overflowed ref counts working.
|
||||
TEST(Regexp, BigConcat) {
|
||||
Regexp* x;
|
||||
x = Regexp::Parse("x", Regexp::NoParseFlags, NULL);
|
||||
vector<Regexp*> v(90000, x); // ToString bails out at 100000
|
||||
for (int i = 0; i < v.size(); i++)
|
||||
x->Incref();
|
||||
CHECK_EQ(x->Ref(), 1 + v.size()) << x->Ref();
|
||||
Regexp* re = Regexp::Concat(&v[0], v.size(), Regexp::NoParseFlags);
|
||||
CHECK_EQ(re->ToString(), string(v.size(), 'x'));
|
||||
re->Decref();
|
||||
CHECK_EQ(x->Ref(), 1) << x->Ref();
|
||||
x->Decref();
|
||||
}
|
||||
|
||||
// Checks the name -> index map of named capturing groups.
// The name g1 is used twice; the expected map keeps its first index.
TEST(Regexp, NamedCaptures) {
  // Expected result: only two distinct names, 'g1' and 'g2'.
  map<string, int> want;
  want["g1"] = 1;
  want["g2"] = 3;

  RegexpStatus status;
  Regexp* x = Regexp::Parse(
      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
  EXPECT_TRUE(status.ok());
  EXPECT_EQ(4, x->NumCaptures());

  const map<string, int>* have = x->NamedCaptures();
  EXPECT_TRUE(have != NULL);
  EXPECT_EQ(2, have->size());  // there are only two named groups in
                               // the regexp: 'g1' and 'g2'.
  EXPECT_EQ(want, *have);

  x->Decref();
  delete have;  // the test releases the returned map itself
}
|
||||
|
||||
// Checks the index -> name map of capturing groups.
// Group 2 is unnamed and absent from the map; groups 1 and 4 share
// the name g1.
TEST(Regexp, CaptureNames) {
  // Expected result.
  map<int, string> want;
  want[1] = "g1";
  want[3] = "g2";
  want[4] = "g1";

  RegexpStatus status;
  Regexp* x = Regexp::Parse(
      "(?P<g1>a+)|(e)(?P<g2>w*)+(?P<g1>b+)", Regexp::PerlX, &status);
  EXPECT_TRUE(status.ok());
  EXPECT_EQ(4, x->NumCaptures());

  const map<int, string>* have = x->CaptureNames();
  EXPECT_TRUE(have != NULL);
  EXPECT_EQ(3, have->size());
  EXPECT_EQ(want, *have);

  x->Decref();
  delete have;  // the test releases the returned map itself
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,67 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One RequiredPrefix test case.
struct PrefixTest {
  const char* regexp;   // Regular expression to parse.
  bool return_value;    // Expected result of RequiredPrefix.
  // The remaining fields are only checked when return_value is true.
  const char* prefix;   // Expected literal prefix.
  bool foldcase;        // Expected case-folding flag.
  const char* suffix;   // Expected ToString() of the remaining regexp.
};

static PrefixTest tests[] = {
  // If the regexp is missing a ^, there's no required prefix.
  // (The unspecified trailing fields are zero-initialized.)
  { "abc", false },
  { "", false },
  { "(?m)^", false },

  // If the regexp immediately goes into
  // something not a literal match, there's no required prefix.
  { "^(abc)", false },
  { "^a*", false },

  // Otherwise, it should work.
  { "^abc$", true, "abc", false, "(?-m:$)" },
  // return_value is a bool: use the literal true, not the string "true",
  // which would be a narrowing pointer-to-bool conversion.
  { "^abc", true, "abc", false, "" },
  { "^(?i)abc", true, "abc", true, "" },
  { "^abcd*", true, "abc", false, "d*" },
  { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
  { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
  { "^☺abc", true, "☺abc", false, "" },
};
|
||||
|
||||
// Runs every entry of tests through Regexp::RequiredPrefix twice,
// once parsed as Latin-1 and once as UTF-8, and compares the result
// with the expectations recorded in the table.
TEST(RequiredPrefix, SimpleTests) {
  for (int i = 0; i < arraysize(tests); i++) {
    const PrefixTest& t = tests[i];
    for (int j = 0; j < 2; j++) {
      // Pass 0 adds the Latin1 flag; pass 1 uses the default encoding.
      const bool latin1 = (j == 0);
      const char* encoding = latin1 ? "latin1" : "utf";

      Regexp::ParseFlags flags = Regexp::LikePerl;
      if (latin1)
        flags = flags | Regexp::Latin1;

      Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
      CHECK(re) << " " << t.regexp;

      // Outputs of RequiredPrefix.
      string p;
      bool f = false;
      Regexp* s = NULL;
      CHECK_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
        << " " << t.regexp << " " << encoding << " " << re->Dump();

      // The outputs are only checked when a prefix is expected.
      if (t.return_value) {
        CHECK_EQ(p, string(t.prefix))
          << " " << t.regexp << " " << encoding;
        CHECK_EQ(f, t.foldcase)
          << " " << t.regexp << " " << encoding;
        CHECK_EQ(s->ToString(), string(t.suffix))
          << " " << t.regexp << " " << encoding;
        s->Decref();
      }
      re->Decref();
    }
  }
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,325 @@
|
|||
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/testing/tester.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One search test case: a regular expression and the text to run it on.
struct RegexpTest {
  const char* regexp;  // Regular expression to test.
  const char* text;    // Text to search; may be NULL (see the "a*" entry).
};

// Table of regexp/text pairs; each pair is passed to TestRegexpOnText
// by the SearchTests test.
RegexpTest simple_tests[] = {
  // Basic literals, repetition, alternation and character classes.
  { "a", "a" },
  { "a", "zyzzyva" },
  { "a+", "aa" },
  { "(a+|b)+", "ab" },
  { "ab|cd", "xabcdx" },
  { "h.*od?", "hello\ngoodbye\n" },
  { "h.*o", "hello\ngoodbye\n" },
  { "h.*o", "goodbye\nhello\n" },
  { "h.*o", "hello world" },
  { "h.*o", "othello, world" },
  { "[^\\s\\S]", "aaaaaaa" },
  { "a", "aaaaaaa" },
  { "a*", "aaaaaaa" },
  { "a*", "" },
  { "a*", NULL },
  { "ab|cd", "xabcdx" },
  { "a", "cab" },
  { "a*b", "cab" },
  { "((((((((((((((((((((x))))))))))))))))))))", "x" },
  { "[abcd]", "xxxabcdxxx" },
  { "[^x]", "xxxabcdxxx" },
  { "[abcd]+", "xxxabcdxxx" },
  { "[^x]+", "xxxabcdxxx" },
  { "(fo|foo)", "fo" },
  { "(foo|fo)", "foo" },

  // Patterns and texts that differ only in letter case.
  { "aa", "aA" },
  { "a", "Aa" },
  { "a", "A" },
  { "ABC", "abc" },
  { "abc", "XABCY" },
  { "ABC", "xabcy" },

  // Make sure ^ and $ work.
  // The pathological cases didn't work
  // in the original grep code.
  { "foo|bar|[A-Z]", "foo" },
  { "^(foo|bar|[A-Z])", "foo" },
  { "(foo|bar|[A-Z])$", "foo\n" },
  { "(foo|bar|[A-Z])$", "foo" },
  { "^(foo|bar|[A-Z])$", "foo\n" },
  { "^(foo|bar|[A-Z])$", "foo" },
  { "^(foo|bar|[A-Z])$", "bar" },
  { "^(foo|bar|[A-Z])$", "X" },
  { "^(foo|bar|[A-Z])$", "XY" },
  { "^(fo|foo)$", "fo" },
  { "^(fo|foo)$", "foo" },
  { "^^(fo|foo)$", "fo" },
  { "^^(fo|foo)$", "foo" },
  { "^$", "" },
  { "^$", "x" },
  { "^^$", "" },
  { "^$$", "" },
  { "^^$", "x" },
  { "^$$", "x" },
  { "^^$$", "" },
  { "^^$$", "x" },
  { "^^^^^^^^$$$$$$$$", "" },
  { "^", "x" },
  { "$", "x" },

  // Word boundaries.
  { "\\bfoo\\b", "nofoo foo that" },
  { "a\\b", "faoa x" },
  { "\\bbar", "bar x" },
  { "\\bbar", "foo\nbar x" },
  { "bar\\b", "foobar" },
  { "bar\\b", "foobar\nxxx" },
  { "(foo|bar|[A-Z])\\b", "foo" },
  { "(foo|bar|[A-Z])\\b", "foo\n" },
  { "\\b", "" },
  { "\\b", "x" },
  { "\\b(foo|bar|[A-Z])", "foo" },
  { "\\b(foo|bar|[A-Z])\\b", "X" },
  { "\\b(foo|bar|[A-Z])\\b", "XY" },
  { "\\b(foo|bar|[A-Z])\\b", "bar" },
  { "\\b(foo|bar|[A-Z])\\b", "foo" },
  { "\\b(foo|bar|[A-Z])\\b", "foo\n" },
  { "\\b(foo|bar|[A-Z])\\b", "ffoo bbar N x" },
  { "\\b(fo|foo)\\b", "fo" },
  { "\\b(fo|foo)\\b", "foo" },
  { "\\b\\b", "" },
  { "\\b\\b", "x" },
  { "\\b$", "" },
  { "\\b$", "x" },
  { "\\b$", "y x" },
  { "\\b.$", "x" },
  { "^\\b(fo|foo)\\b", "fo" },
  { "^\\b(fo|foo)\\b", "foo" },
  { "^\\b", "" },
  { "^\\b", "x" },
  { "^\\b\\b", "" },
  { "^\\b\\b", "x" },
  { "^\\b$", "" },
  { "^\\b$", "x" },
  { "^\\b.$", "x" },
  { "^\\b.\\b$", "x" },
  { "^^^^^^^^\\b$$$$$$$", "" },
  { "^^^^^^^^\\b.$$$$$$", "x" },
  { "^^^^^^^^\\b$$$$$$$", "x" },

  // Non-word boundaries.
  { "\\Bfoo\\B", "n foo xfoox that" },
  { "a\\B", "faoa x" },
  { "\\Bbar", "bar x" },
  { "\\Bbar", "foo\nbar x" },
  { "bar\\B", "foobar" },
  { "bar\\B", "foobar\nxxx" },
  { "(foo|bar|[A-Z])\\B", "foox" },
  { "(foo|bar|[A-Z])\\B", "foo\n" },
  { "\\B", "" },
  { "\\B", "x" },
  { "\\B(foo|bar|[A-Z])", "foo" },
  { "\\B(foo|bar|[A-Z])\\B", "xXy" },
  { "\\B(foo|bar|[A-Z])\\B", "XY" },
  { "\\B(foo|bar|[A-Z])\\B", "XYZ" },
  { "\\B(foo|bar|[A-Z])\\B", "abara" },
  { "\\B(foo|bar|[A-Z])\\B", "xfoo_" },
  { "\\B(foo|bar|[A-Z])\\B", "xfoo\n" },
  { "\\B(foo|bar|[A-Z])\\B", "foo bar vNx" },
  { "\\B(fo|foo)\\B", "xfoo" },
  { "\\B(foo|fo)\\B", "xfooo" },
  { "\\B\\B", "" },
  { "\\B\\B", "x" },
  { "\\B$", "" },
  { "\\B$", "x" },
  { "\\B$", "y x" },
  { "\\B.$", "x" },
  { "^\\B(fo|foo)\\B", "fo" },
  { "^\\B(fo|foo)\\B", "foo" },
  { "^\\B", "" },
  { "^\\B", "x" },
  { "^\\B\\B", "" },
  { "^\\B\\B", "x" },
  { "^\\B$", "" },
  { "^\\B$", "x" },
  { "^\\B.$", "x" },
  { "^\\B.\\B$", "x" },
  { "^^^^^^^^\\B$$$$$$$", "" },
  { "^^^^^^^^\\B.$$$$$$", "x" },
  { "^^^^^^^^\\B$$$$$$$", "x" },

  // PCRE uses only ASCII for \b computation.
  // All non-ASCII are *not* word characters.
  { "\\bx\\b", "x" },
  { "\\bx\\b", "x>" },
  { "\\bx\\b", "<x" },
  { "\\bx\\b", "<x>" },
  { "\\bx\\b", "ax" },
  { "\\bx\\b", "xb" },
  { "\\bx\\b", "axb" },
  { "\\bx\\b", "«x" },
  { "\\bx\\b", "x»" },
  { "\\bx\\b", "«x»" },
  { "\\bx\\b", "axb" },
  { "\\bx\\b", "áxβ" },
  { "\\Bx\\B", "axb" },
  { "\\Bx\\B", "áxβ" },

  // Weird boundary cases.
  { "^$^$", "" },
  { "^$^", "" },
  { "$^$", "" },

  { "^$^$", "x" },
  { "^$^", "x" },
  { "$^$", "x" },

  { "^$^$", "x\ny" },
  { "^$^", "x\ny" },
  { "$^$", "x\ny" },

  { "^$^$", "x\n\ny" },
  { "^$^", "x\n\ny" },
  { "$^$", "x\n\ny" },

  { "^(foo\\$)$", "foo$bar" },
  { "(foo\\$)", "foo$bar" },
  { "^...$", "abc" },

  // UTF-8
  { "^\xe6\x9c\xac$", "\xe6\x9c\xac" },
  { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
  { "^...$", ".\xe6\x9c\xac." },

  // \C in the pattern, applied to multi-byte texts.
  { "^\\C\\C\\C$", "\xe6\x9c\xac" },
  { "^\\C$", "\xe6\x9c\xac" },
  { "^\\C\\C\\C$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },

  // Latin1
  { "^...$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
  { "^.........$", "\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e" },
  { "^...$", ".\xe6\x9c\xac." },
  { "^.....$", ".\xe6\x9c\xac." },

  // Perl v Posix
  { "\\B(fo|foo)\\B", "xfooo" },
  { "(fo|foo)", "foo" },

  // Octal escapes.
  { "\\141", "a" },
  { "\\060", "0" },
  { "\\0600", "00" },
  { "\\608", "08" },
  { "\\01", "\01" },
  { "\\018", "\01" "8" },

  // Hexadecimal escapes
  { "\\x{61}", "a" },
  { "\\x61", "a" },
  { "\\x{00000061}", "a" },

  // Unicode scripts.
  { "\\p{Greek}+", "aαβb" },
  { "\\P{Greek}+", "aαβb" },
  { "\\p{^Greek}+", "aαβb" },
  { "\\P{^Greek}+", "aαβb" },

  // Unicode properties.  Nd is decimal number.  N is any number.
  { "[^0-9]+", "abc123" },
  { "\\p{Nd}+", "abc123²³¼½¾₀₉" },
  { "\\p{^Nd}+", "abc123²³¼½¾₀₉" },
  { "\\P{Nd}+", "abc123²³¼½¾₀₉" },
  { "\\P{^Nd}+", "abc123²³¼½¾₀₉" },
  { "\\pN+", "abc123²³¼½¾₀₉" },
  { "\\p{N}+", "abc123²³¼½¾₀₉" },
  { "\\p{^N}+", "abc123²³¼½¾₀₉" },

  { "\\p{Any}+", "abc123" },

  // Character classes & case folding.
  { "(?i)[@-A]+", "@AaB" },  // matches @Aa but not B
  { "(?i)[A-Z]+", "aAzZ" },
  { "(?i)[^\\\\]+", "Aa\\" },  // \\ is between A-Z and a-z -
                               // splits the ranges in an interesting way.

  // would like to use, but PCRE mishandles in full-match, non-greedy mode
  // { "(?i)[\\\\]+", "Aa" },

  { "(?i)[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },

  // The same character classes without the (?i) case-folding flag.
  { "[@-A]+", "@AaB" },
  { "[A-Z]+", "aAzZ" },
  { "[^\\\\]+", "Aa\\" },
  { "[acegikmoqsuwy]+", "acegikmoqsuwyACEGIKMOQSUWY" },

  // Anchoring.  (^abc in aabcdef was a former bug)
  // The tester checks for a match in the text and
  // subpieces of the text with a byte removed on either side.
  { "^abc", "abcdef" },
  { "^abc", "aabcdef" },
  { "^[ay]*[bx]+c", "abcdef" },
  { "^[ay]*[bx]+c", "aabcdef" },
  { "def$", "abcdef" },
  { "def$", "abcdeff" },
  { "d[ex][fy]$", "abcdef" },
  { "d[ex][fy]$", "abcdeff" },
  { "[dz][ex][fy]$", "abcdef" },
  { "[dz][ex][fy]$", "abcdeff" },
  { "(?m)^abc", "abcdef" },
  { "(?m)^abc", "aabcdef" },
  { "(?m)^[ay]*[bx]+c", "abcdef" },
  { "(?m)^[ay]*[bx]+c", "aabcdef" },
  { "(?m)def$", "abcdef" },
  { "(?m)def$", "abcdeff" },
  { "(?m)d[ex][fy]$", "abcdef" },
  { "(?m)d[ex][fy]$", "abcdeff" },
  { "(?m)[dz][ex][fy]$", "abcdef" },
  { "(?m)[dz][ex][fy]$", "abcdeff" },
  { "^", "a" },
  { "^^", "a" },

  // Context.
  // The tester checks for a match in the text and
  // subpieces of the text with a byte removed on either side.
  { "a", "a" },
  { "ab*", "a" },
  { "a\\C*", "a" },

  // Former bugs.
  { "a\\C*|ba\\C", "baba" },
};
|
||||
|
||||
// Runs every regexp/text pair in simple_tests through TestRegexpOnText
// and expects none of them to fail.  All pairs are run even after a
// failure; only the total is checked at the end.
TEST(Regexp, SearchTests) {
  int failures = 0;
  for (int i = 0; i < arraysize(simple_tests); i++) {
    const RegexpTest& t = simple_tests[i];
    const bool ok = TestRegexpOnText(t.regexp, t.text);
    if (!ok)
      failures++;

#ifdef LOGGING
    // Build a dummy ExhaustiveTest call that will trigger just
    // this one test, so that we log the test case.
    vector<string> atom, alpha, ops;
    atom.push_back(StringPiece(t.regexp).as_string());
    alpha.push_back(StringPiece(t.text).as_string());
    ExhaustiveTest(1, 0, atom, ops, 1, alpha, "", "");
#endif
  }
  EXPECT_EQ(failures, 0);
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,102 @@
|
|||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <vector>
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/set.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Unanchored set of two patterns: each pattern is reported when it
// occurs anywhere in the text.
TEST(Set, Unanchored) {
  RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
  vector<int> v;

  // Valid patterns are numbered 0, 1, ...; the invalid "(" is rejected
  // with -1 and does not use up an index.
  CHECK_EQ(s.Add("foo", NULL), 0);
  CHECK_EQ(s.Add("(", NULL), -1);
  CHECK_EQ(s.Add("bar", NULL), 1);

  CHECK_EQ(s.Compile(), true);

  // Both patterns occur in the text.
  CHECK_EQ(s.Match("foobar", &v), true);
  CHECK_EQ(v.size(), 2);
  CHECK_EQ(v[0], 0);
  CHECK_EQ(v[1], 1);

  // Only "foo" occurs.
  v.clear();
  CHECK_EQ(s.Match("fooba", &v), true);
  CHECK_EQ(v.size(), 1);
  CHECK_EQ(v[0], 0);

  // Only "bar" occurs.
  v.clear();
  CHECK_EQ(s.Match("oobar", &v), true);
  CHECK_EQ(v.size(), 1);
  CHECK_EQ(v[0], 1);
}
|
||||
|
||||
// Unanchored set in which one pattern ("foo") is a prefix of the
// other ("foobar").
TEST(Set, UnanchoredFactored) {
  RE2::Set s(RE2::DefaultOptions, RE2::UNANCHORED);
  vector<int> v;

  // The invalid "(" is rejected with -1 and does not use up an index.
  CHECK_EQ(s.Add("foo", NULL), 0);
  CHECK_EQ(s.Add("(", NULL), -1);
  CHECK_EQ(s.Add("foobar", NULL), 1);

  CHECK_EQ(s.Compile(), true);

  // The whole text: both patterns match.
  CHECK_EQ(s.Match("foobar", &v), true);
  CHECK_EQ(v.size(), 2);
  CHECK_EQ(v[0], 0);
  CHECK_EQ(v[1], 1);

  // Both patterns occur in the middle of the text.
  v.clear();
  CHECK_EQ(s.Match("obarfoobaroo", &v), true);
  CHECK_EQ(v.size(), 2);
  CHECK_EQ(v[0], 0);
  CHECK_EQ(v[1], 1);

  // Only the shorter pattern occurs.
  v.clear();
  CHECK_EQ(s.Match("fooba", &v), true);
  CHECK_EQ(v.size(), 1);
  CHECK_EQ(v[0], 0);

  // Neither pattern occurs.
  v.clear();
  CHECK_EQ(s.Match("oobar", &v), false);
  CHECK_EQ(v.size(), 0);
}
|
||||
|
||||
// Set anchored at both ends: a pattern is reported only when it
// matches the entire text.
TEST(Set, Anchored) {
  RE2::Set s(RE2::DefaultOptions, RE2::ANCHOR_BOTH);
  vector<int> v;

  // The invalid "(" is rejected with -1 and does not use up an index.
  CHECK_EQ(s.Add("foo", NULL), 0);
  CHECK_EQ(s.Add("(", NULL), -1);
  CHECK_EQ(s.Add("bar", NULL), 1);

  CHECK_EQ(s.Compile(), true);

  // v is deliberately not cleared between calls: the size checks below
  // only hold if Match does not accumulate results from earlier calls.

  // Texts that merely contain a pattern do not match.
  CHECK_EQ(s.Match("foobar", &v), false);
  CHECK_EQ(v.size(), 0);

  CHECK_EQ(s.Match("fooba", &v), false);
  CHECK_EQ(v.size(), 0);

  CHECK_EQ(s.Match("oobar", &v), false);
  CHECK_EQ(v.size(), 0);

  // Texts equal to a pattern match exactly that pattern.
  CHECK_EQ(s.Match("foo", &v), true);
  CHECK_EQ(v.size(), 1);
  CHECK_EQ(v[0], 0);

  CHECK_EQ(s.Match("bar", &v), true);
  CHECK_EQ(v.size(), 1);
  CHECK_EQ(v[0], 1);
}
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test simplify.cc.
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One simplification test case.
struct Test {
  const char* regexp;      // Regular expression to parse and simplify.
  const char* simplified;  // Expected ToString() of the simplified regexp.
};

// Table of cases checked by the SimpleRegexps test.  When regexp and
// simplified are the same string, that test additionally requires
// Simplify to return the very same Regexp object.
static Test tests[] = {
  // Already-simple constructs
  { "a", "a" },
  { "ab", "ab" },
  { "a|b", "[a-b]" },
  { "ab|cd", "ab|cd" },
  { "(ab)*", "(ab)*" },
  { "(ab)+", "(ab)+" },
  { "(ab)?", "(ab)?" },
  { ".", "." },
  { "^", "^" },
  { "$", "$" },
  { "[ac]", "[ac]" },
  { "[^ac]", "[^ac]" },

  // Posix character classes
  { "[[:alnum:]]", "[0-9A-Za-z]" },
  { "[[:alpha:]]", "[A-Za-z]" },
  { "[[:blank:]]", "[\\t ]" },
  { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
  { "[[:digit:]]", "[0-9]" },
  { "[[:graph:]]", "[!-~]" },
  { "[[:lower:]]", "[a-z]" },
  { "[[:print:]]", "[ -~]" },
  { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
  { "[[:space:]]" , "[\\t-\\r ]" },
  { "[[:upper:]]", "[A-Z]" },
  { "[[:xdigit:]]", "[0-9A-Fa-f]" },

  // Perl character classes
  { "\\d", "[0-9]" },
  { "\\s", "[\\t-\\n\\f-\\r ]" },
  { "\\w", "[0-9A-Z_a-z]" },
  { "\\D", "[^0-9]" },
  { "\\S", "[^\\t-\\n\\f-\\r ]" },
  { "\\W", "[^0-9A-Z_a-z]" },
  { "[\\d]", "[0-9]" },
  { "[\\s]", "[\\t-\\n\\f-\\r ]" },
  { "[\\w]", "[0-9A-Z_a-z]" },
  { "[\\D]", "[^0-9]" },
  { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
  { "[\\W]", "[^0-9A-Z_a-z]" },

  // Posix repetitions
  { "a{1}", "a" },
  { "a{2}", "aa" },
  { "a{5}", "aaaaa" },
  { "a{0,1}", "a?" },
  // The following entries are hard to read because Simplify inserts (?:)
  // parens instead of () parens to avoid creating extra
  // captured subexpressions.  The trailing comments show a version
  // with fewer parens.
  { "(a){0,2}", "(?:(a)(a)?)?" },  // (aa?)?
  { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  // (a(a(aa?)?)?)?
  { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  // aa(a(a(aa?)?)?)?
  { "a{0,2}", "(?:aa?)?" },  // (aa?)?
  { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" },  // (a(a(aa?)?)?)?
  { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" },  // aa(a(a(aa?)?)?)?
  { "a{0,}", "a*" },
  { "a{1,}", "a+" },
  { "a{2,}", "aa+" },
  { "a{5,}", "aaaaa+" },

  // Test that operators simplify their arguments.
  // (Simplify used to not simplify arguments to a {} repeat.)
  { "(?:a{1,}){1,}", "a+" },
  { "(a{1,}b{1,})", "(a+b+)" },
  { "a{1,}|b{1,}", "a+|b+" },
  { "(?:a{1,})*", "(?:a+)*" },
  { "(?:a{1,})+", "a+" },
  { "(?:a{1,})?", "(?:a+)?" },
  { "a{0}", "" },

  // Character class simplification
  { "[ab]", "[a-b]" },
  { "[a-za-za-z]", "[a-z]" },
  { "[A-Za-zA-Za-z]", "[A-Za-z]" },
  { "[ABCDEFGH]", "[A-H]" },
  { "[AB-CD-EF-GH]", "[A-H]" },
  { "[W-ZP-XE-R]", "[E-Z]" },
  { "[a-ee-gg-m]", "[a-m]" },
  { "[a-ea-ha-m]", "[a-m]" },
  { "[a-ma-ha-e]", "[a-m]" },
  { "[a-zA-Z0-9 -~]", "[ -~]" },

  // Empty character classes
  { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },

  // Full character classes
  { "[[:cntrl:][:^cntrl:]]", "." },

  // Unicode case folding.
  { "(?i)A", "[Aa]" },
  { "(?i)a", "[Aa]" },
  { "(?i)K", "[Kk\\x{212a}]" },
  { "(?i)k", "[Kk\\x{212a}]" },
  { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
  { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
  { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
  { "(?i)[\\x00-\\x{10ffff}]", "." },

  // Empty string as a regular expression.
  // Empty string must be preserved inside parens in order
  // to make submatches work right, so these are less
  // interesting than they used to be.  ToString inserts
  // explicit (?:) in place of non-parenthesized empty strings,
  // to make them easier to spot for other parsers.
  { "(a|b|)", "([a-b]|(?:))" },
  { "(|)", "()" },
  { "a()", "a()" },
  { "(()|())", "(()|())" },
  { "(a|)", "(a|(?:))" },
  { "ab()cd()", "ab()cd()" },
  { "()", "()" },
  { "()*", "()*" },
  { "()+", "()+" },
  { "()?" , "()?" },
  { "(){0}", "" },
  { "(){1}", "()" },
  { "(){1,}", "()+" },
  { "(){0,2}", "(?:()()?)?" },
};
|
||||
|
||||
// Parses each entry of tests, simplifies it, and compares the printed
// form of the result with the expected string.
TEST(TestSimplify, SimpleRegexps) {
  for (int i = 0; i < arraysize(tests); i++) {
    RegexpStatus status;
    VLOG(1) << "Testing " << tests[i].regexp;

    // Perl-like syntax without OneLine, with MatchNL added.
    Regexp* re = Regexp::Parse(tests[i].regexp,
                               Regexp::MatchNL | (Regexp::LikePerl &
                                                  ~Regexp::OneLine),
                               &status);
    CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();

    Regexp* sre = re->Simplify();
    CHECK(sre != NULL);

    // Check that already-simple regexps don't allocate new ones:
    // Simplify must hand back the very same object.
    const bool already_simple =
        strcmp(tests[i].regexp, tests[i].simplified) == 0;
    if (already_simple) {
      CHECK(re == sre) << " " << tests[i].regexp
        << " " << re->ToString() << " " << sre->ToString();
    }

    EXPECT_EQ(tests[i].simplified, sre->ToString())
      << " " << tests[i].regexp << " " << sre->Dump();

    // Both the parsed and the simplified regexp hold a reference.
    re->Decref();
    sre->Decref();
  }
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,113 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// String generator: generates all possible strings of up to
|
||||
// maxlen letters using the set of letters in alpha.
|
||||
// Fetch strings using a Java-like Next()/HasNext() interface.
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Creates a generator of strings of up to maxlen letters drawn from
// alphabet.  The generator starts in exhaustive (non-random) mode
// with no random generator allocated.
StringGenerator::StringGenerator(int maxlen, const vector<string>& alphabet)
    : maxlen_(maxlen), alphabet_(alphabet),
      generate_null_(false),
      random_(false), nrandom_(0), acm_(NULL) {

  // Degenerate case: with no letters there are no non-empty strings.
  if (alphabet_.empty())
    maxlen_ = 0;

  // digits_ starts out empty, so the first Next() returns the empty string.
  hasnext_ = true;
}
|
||||
|
||||
// Releases the random generator, if one was ever created by Random().
StringGenerator::~StringGenerator() {
  delete acm_;
}
|
||||
|
||||
// Resets the string generator state to the beginning.
|
||||
void StringGenerator::Reset() {
|
||||
digits_.clear();
|
||||
hasnext_ = true;
|
||||
random_ = false;
|
||||
nrandom_ = 0;
|
||||
generate_null_ = false;
|
||||
}
|
||||
|
||||
// Increments the big number in digits_, returning true if successful.
|
||||
// Returns false if all the numbers have been used.
|
||||
bool StringGenerator::IncrementDigits() {
|
||||
// First try to increment the current number.
|
||||
for (int i = digits_.size() - 1; i >= 0; i--) {
|
||||
if (++digits_[i] < alphabet_.size())
|
||||
return true;
|
||||
digits_[i] = 0;
|
||||
}
|
||||
|
||||
// If that failed, make a longer number.
|
||||
if (digits_.size() < maxlen_) {
|
||||
digits_.push_back(0);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Generates random digits_, return true if successful.
|
||||
// Returns false if the random sequence is over.
|
||||
bool StringGenerator::RandomDigits() {
|
||||
if (--nrandom_ <= 0)
|
||||
return false;
|
||||
|
||||
// Pick length.
|
||||
int len = acm_->Uniform(maxlen_+1);
|
||||
digits_.resize(len);
|
||||
for (int i = 0; i < len; i++)
|
||||
digits_[i] = acm_->Uniform(alphabet_.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the next string in the iteration, which is the one
|
||||
// currently described by digits_. Calls IncrementDigits
|
||||
// after computing the string, so that it knows the answer
|
||||
// for subsequent HasNext() calls.
|
||||
const StringPiece& StringGenerator::Next() {
|
||||
CHECK(hasnext_);
|
||||
if (generate_null_) {
|
||||
generate_null_ = false;
|
||||
sp_ = NULL;
|
||||
return sp_;
|
||||
}
|
||||
s_.clear();
|
||||
for (int i = 0; i < digits_.size(); i++) {
|
||||
s_ += alphabet_[digits_[i]];
|
||||
}
|
||||
hasnext_ = random_ ? RandomDigits() : IncrementDigits();
|
||||
sp_ = s_;
|
||||
return sp_;
|
||||
}
|
||||
|
||||
// Sets generator up to return n random strings.
|
||||
void StringGenerator::Random(int32 seed, int n) {
|
||||
if (acm_ == NULL)
|
||||
acm_ = new ACMRandom(seed);
|
||||
else
|
||||
acm_->Reset(seed);
|
||||
|
||||
random_ = true;
|
||||
nrandom_ = n;
|
||||
hasnext_ = nrandom_ > 0;
|
||||
}
|
||||
|
||||
// Arranges for the very next call to Next() to return a NULL StringPiece.
// HasNext() is forced to true so that the call is permitted.
void StringGenerator::GenerateNULL() {
  hasnext_ = true;
  generate_null_ = true;
}
|
||||
|
||||
} // namespace re2
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// String generator: generates all possible strings of up to
|
||||
// maxlen letters using the set of letters in alpha.
|
||||
// Fetch strings using a Java-like Next()/HasNext() interface.
|
||||
|
||||
#ifndef RE2_TESTING_STRING_GENERATOR_H__
|
||||
#define RE2_TESTING_STRING_GENERATOR_H__
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/util.h"
|
||||
#include "util/random.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class StringGenerator {
|
||||
public:
|
||||
StringGenerator(int maxlen, const vector<string>& alphabet);
|
||||
~StringGenerator();
|
||||
const StringPiece& Next();
|
||||
bool HasNext() { return hasnext_; }
|
||||
|
||||
// Resets generator to start sequence over.
|
||||
void Reset();
|
||||
|
||||
// Causes generator to emit random strings for next n calls to Next().
|
||||
void Random(int32 seed, int n);
|
||||
|
||||
// Causes generator to emit a NULL as the next call.
|
||||
void GenerateNULL();
|
||||
|
||||
private:
|
||||
bool IncrementDigits();
|
||||
bool RandomDigits();
|
||||
|
||||
// Global state.
|
||||
int maxlen_; // Maximum length string to generate.
|
||||
vector<string> alphabet_; // Alphabet, one string per letter.
|
||||
|
||||
// Iteration state.
|
||||
StringPiece sp_; // Last StringPiece returned by Next().
|
||||
string s_; // String data in last StringPiece returned by Next().
|
||||
bool hasnext_; // Whether Next() can be called again.
|
||||
vector<int> digits_; // Alphabet indices for next string.
|
||||
bool generate_null_; // Whether to generate a NULL StringPiece next.
|
||||
bool random_; // Whether generated strings are random.
|
||||
int nrandom_; // Number of random strings left to generate.
|
||||
ACMRandom* acm_; // Random number generator
|
||||
DISALLOW_EVIL_CONSTRUCTORS(StringGenerator);
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_TESTING_STRING_GENERATOR_H__
|
|
@ -0,0 +1,109 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test StringGenerator.
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Returns i to the e.
|
||||
static int64 IntegerPower(int i, int e) {
|
||||
int64 p = 1;
|
||||
while (e-- > 0)
|
||||
p *= i;
|
||||
return p;
|
||||
}
|
||||
|
||||
// Checks that for given settings of the string generator:
|
||||
// * it generates strings that are non-decreasing in length.
|
||||
// * strings of the same length are sorted in alphabet order.
|
||||
// * it doesn't generate the same string twice.
|
||||
// * it generates the right number of strings.
|
||||
//
|
||||
// If all of these hold, the StringGenerator is behaving.
|
||||
// Assumes that the alphabet is sorted, so that the generated
|
||||
// strings can just be compared lexicographically.
|
||||
static void RunTest(int len, string alphabet, bool donull) {
|
||||
StringGenerator g(len, Explode(alphabet));
|
||||
|
||||
int n = 0;
|
||||
int last_l = -1;
|
||||
string last_s;
|
||||
|
||||
if (donull) {
|
||||
g.GenerateNULL();
|
||||
EXPECT_TRUE(g.HasNext());
|
||||
StringPiece sp = g.Next();
|
||||
EXPECT_EQ(sp.data(), static_cast<const char*>(NULL));
|
||||
EXPECT_EQ(sp.size(), 0);
|
||||
}
|
||||
|
||||
while (g.HasNext()) {
|
||||
string s = g.Next().as_string();
|
||||
n++;
|
||||
|
||||
// Check that all characters in s appear in alphabet.
|
||||
for (const char *p = s.c_str(); *p != '\0'; ) {
|
||||
Rune r;
|
||||
p += chartorune(&r, p);
|
||||
EXPECT_TRUE(utfrune(alphabet.c_str(), r) != NULL);
|
||||
}
|
||||
|
||||
// Check that string is properly ordered w.r.t. previous string.
|
||||
int l = utflen(s.c_str());
|
||||
EXPECT_LE(l, len);
|
||||
if (last_l < l) {
|
||||
last_l = l;
|
||||
} else {
|
||||
EXPECT_EQ(last_l, l);
|
||||
EXPECT_LT(last_s, s);
|
||||
}
|
||||
last_s = s;
|
||||
}
|
||||
|
||||
// Check total string count.
|
||||
int64 m = 0;
|
||||
int alpha = utflen(alphabet.c_str());
|
||||
if (alpha == 0) // Degenerate case.
|
||||
len = 0;
|
||||
for (int i = 0; i <= len; i++)
|
||||
m += IntegerPower(alpha, i);
|
||||
EXPECT_EQ(n, m);
|
||||
}
|
||||
|
||||
// Maximum length zero with a non-empty alphabet.
TEST(StringGenerator, NoLength) {
  RunTest(0, "abc", false);
}

// Maximum length zero and an empty alphabet.
TEST(StringGenerator, NoLengthNoAlphabet) {
  RunTest(0, "", false);
}

// Positive maximum length but an empty alphabet.
TEST(StringGenerator, NoAlphabet) {
  RunTest(5, "", false);
}

// Plain ASCII alphabet.
TEST(StringGenerator, Simple) {
  RunTest(3, "abc", false);
}

// Alphabet that includes a multi-byte UTF-8 letter.
TEST(StringGenerator, UTF8) {
  RunTest(4, "abc\xE2\x98\xBA", false);
}

// All of the configurations above, each preceded by a generated NULL.
TEST(StringGenerator, GenNULL) {
  RunTest(0, "abc", true);
  RunTest(0, "", true);
  RunTest(5, "", true);
  RunTest(3, "abc", true);
  RunTest(4, "abc\xE2\x98\xBA", true);
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,640 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Regular expression engine tester -- test all the implementations against each other.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/flags.h"
|
||||
#include "re2/testing/tester.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
// Diagnostic output switches.
DEFINE_bool(dump_prog, false, "dump regexp program");
DEFINE_bool(log_okay, false, "log successful runs");
DEFINE_bool(dump_rprog, false, "dump reversed regexp program");

// Failure budget; RunCase decrements it on each failing case.
DEFINE_int32(max_regexp_failures, 100,
             "maximum number of regexp test failures (-1 = unlimited)");

// Substring matched against engine names to choose engines; empty = all.
DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
enum {
|
||||
kMaxSubmatch = 1+16, // $0...$16
|
||||
};
|
||||
|
||||
const char* engine_types[kEngineMax] = {
|
||||
"Backtrack",
|
||||
"NFA",
|
||||
"DFA",
|
||||
"DFA1",
|
||||
"OnePass",
|
||||
"BitState",
|
||||
"RE2",
|
||||
"RE2a",
|
||||
"RE2b",
|
||||
"PCRE",
|
||||
};
|
||||
|
||||
// Returns the name string for the type t.
|
||||
static string EngineString(Engine t) {
|
||||
if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
|
||||
return StringPrintf("type%d", static_cast<int>(t));
|
||||
}
|
||||
return engine_types[t];
|
||||
}
|
||||
|
||||
// Returns bit mask of engines to use.
|
||||
static uint32 Engines() {
|
||||
static uint32 cached_engines;
|
||||
static bool did_parse;
|
||||
|
||||
if (did_parse)
|
||||
return cached_engines;
|
||||
|
||||
if (FLAGS_regexp_engines.empty()) {
|
||||
cached_engines = ~0;
|
||||
} else {
|
||||
for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
|
||||
if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
|
||||
cached_engines |= 1<<i;
|
||||
}
|
||||
|
||||
if (cached_engines == 0)
|
||||
LOG(INFO) << "Warning: no engines enabled.";
|
||||
if (!UsingPCRE)
|
||||
cached_engines &= ~(1<<kEnginePCRE);
|
||||
for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
|
||||
if (cached_engines & (1<<i))
|
||||
LOG(INFO) << EngineString(i) << " enabled";
|
||||
}
|
||||
did_parse = true;
|
||||
return cached_engines;
|
||||
}
|
||||
|
||||
// The result of running a match.
|
||||
struct TestInstance::Result {
|
||||
bool skipped; // test skipped: wasn't applicable
|
||||
bool matched; // found a match
|
||||
bool untrusted; // don't really trust the answer
|
||||
bool have_submatch; // computed all submatch info
|
||||
bool have_submatch0; // computed just submatch[0]
|
||||
StringPiece submatch[kMaxSubmatch];
|
||||
};
|
||||
|
||||
typedef TestInstance::Result Result;
|
||||
|
||||
// Formats a single capture range s in text in the form (a,b)
|
||||
// where a and b are the starting and ending offsets of s in text.
|
||||
static string FormatCapture(const StringPiece& text, const StringPiece& s) {
|
||||
if (s.begin() == NULL)
|
||||
return "(?,?)";
|
||||
return StringPrintf("(%d,%d)",
|
||||
static_cast<int>(s.begin() - text.begin()),
|
||||
static_cast<int>(s.end() - text.begin()));
|
||||
}
|
||||
|
||||
// Returns whether text contains non-ASCII (>= 0x80) bytes.
|
||||
static bool NonASCII(const StringPiece& text) {
|
||||
for (int i = 0; i < text.size(); i++)
|
||||
if ((uint8)text[i] >= 0x80)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns string representation of match kind.
|
||||
static string FormatKind(Prog::MatchKind kind) {
|
||||
switch (kind) {
|
||||
case Prog::kFullMatch:
|
||||
return "full match";
|
||||
case Prog::kLongestMatch:
|
||||
return "longest match";
|
||||
case Prog::kFirstMatch:
|
||||
return "first match";
|
||||
case Prog::kManyMatch:
|
||||
return "many match";
|
||||
}
|
||||
return "???";
|
||||
}
|
||||
|
||||
// Returns string representation of anchor kind.
|
||||
static string FormatAnchor(Prog::Anchor anchor) {
|
||||
switch (anchor) {
|
||||
case Prog::kAnchored:
|
||||
return "anchored";
|
||||
case Prog::kUnanchored:
|
||||
return "unanchored";
|
||||
}
|
||||
return "???";
|
||||
}
|
||||
|
||||
struct ParseMode {
|
||||
Regexp::ParseFlags parse_flags;
|
||||
string desc;
|
||||
};
|
||||
|
||||
static const Regexp::ParseFlags single_line =
|
||||
Regexp::LikePerl;
|
||||
static const Regexp::ParseFlags multi_line =
|
||||
static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
|
||||
|
||||
static ParseMode parse_modes[] = {
|
||||
{ single_line, "single-line" },
|
||||
{ single_line|Regexp::Latin1, "single-line, latin1" },
|
||||
{ multi_line, "multiline" },
|
||||
{ multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
|
||||
{ multi_line|Regexp::Latin1, "multiline, latin1" },
|
||||
};
|
||||
|
||||
// Returns the label of the parse_modes entry whose flags equal `flags`,
// or the flags printed in hexadecimal when no entry matches.
static string FormatMode(Regexp::ParseFlags flags) {
  const ParseMode* const limit = parse_modes + arraysize(parse_modes);
  for (const ParseMode* mode = parse_modes; mode != limit; ++mode) {
    if (mode->parse_flags == flags)
      return mode->desc;
  }
  return StringPrintf("%#x", static_cast<uint>(flags));
}
|
||||
|
||||
// Constructs and saves all the matching engines that
|
||||
// will be required for the given tests.
|
||||
TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
|
||||
Regexp::ParseFlags flags)
|
||||
: regexp_str_(regexp_str),
|
||||
kind_(kind),
|
||||
flags_(flags),
|
||||
error_(false),
|
||||
regexp_(NULL),
|
||||
num_captures_(0),
|
||||
prog_(NULL),
|
||||
rprog_(NULL),
|
||||
re_(NULL),
|
||||
re2_(NULL) {
|
||||
|
||||
VLOG(1) << CEscape(regexp_str);
|
||||
|
||||
// Compile regexp to prog.
|
||||
// Always required - needed for backtracking (reference implementation).
|
||||
RegexpStatus status;
|
||||
regexp_ = Regexp::Parse(regexp_str, flags, &status);
|
||||
if (regexp_ == NULL) {
|
||||
LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
|
||||
<< " mode: " << FormatMode(flags);
|
||||
error_ = true;
|
||||
return;
|
||||
}
|
||||
num_captures_ = regexp_->NumCaptures();
|
||||
prog_ = regexp_->CompileToProg(0);
|
||||
if (prog_ == NULL) {
|
||||
LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
|
||||
error_ = true;
|
||||
return;
|
||||
}
|
||||
if (FLAGS_dump_prog) {
|
||||
LOG(INFO) << "Prog for "
|
||||
<< " regexp "
|
||||
<< CEscape(regexp_str_)
|
||||
<< " (" << FormatKind(kind_)
|
||||
<< ", " << FormatMode(flags_)
|
||||
<< ")\n"
|
||||
<< prog_->Dump();
|
||||
}
|
||||
|
||||
// Compile regexp to reversed prog. Only needed for DFA engines.
|
||||
if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
|
||||
rprog_ = regexp_->CompileToReverseProg(0);
|
||||
if (rprog_ == NULL) {
|
||||
LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
|
||||
error_ = true;
|
||||
return;
|
||||
}
|
||||
if (FLAGS_dump_rprog)
|
||||
LOG(INFO) << rprog_->Dump();
|
||||
}
|
||||
|
||||
// Create re string that will be used for RE and RE2.
|
||||
string re = regexp_str.as_string();
|
||||
// Accomodate flags.
|
||||
// Regexp::Latin1 will be accomodated below.
|
||||
if (!(flags & Regexp::OneLine))
|
||||
re = "(?m)" + re;
|
||||
if (flags & Regexp::NonGreedy)
|
||||
re = "(?U)" + re;
|
||||
if (flags & Regexp::DotNL)
|
||||
re = "(?s)" + re;
|
||||
|
||||
// Compile regexp to RE2.
|
||||
if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
|
||||
RE2::Options options;
|
||||
if (flags & Regexp::Latin1)
|
||||
options.set_encoding(RE2::Options::EncodingLatin1);
|
||||
if (kind_ == Prog::kLongestMatch)
|
||||
options.set_longest_match(true);
|
||||
re2_ = new RE2(re, options);
|
||||
if (!re2_->error().empty()) {
|
||||
LOG(INFO) << "Cannot RE2: " << CEscape(re);
|
||||
error_ = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Compile regexp to RE.
|
||||
// PCRE as exposed by the RE interface isn't always usable.
|
||||
// 1. It disagrees about handling of empty-string reptitions
|
||||
// like matching (a*)* against "b". PCRE treats the (a*) as
|
||||
// occurring once, while we treat it as occurring not at all.
|
||||
// 2. It treats $ as this weird thing meaning end of string
|
||||
// or before the \n at the end of the string.
|
||||
// 3. It doesn't implement POSIX leftmost-longest matching.
|
||||
// MimicsPCRE() detects 1 and 2.
|
||||
if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
|
||||
kind_ != Prog::kLongestMatch) {
|
||||
PCRE_Options o;
|
||||
o.set_option(PCRE::UTF8);
|
||||
if (flags & Regexp::Latin1)
|
||||
o.set_option(PCRE::None);
|
||||
// PCRE has interface bug keeping us from finding $0, so
|
||||
// add one more layer of parens.
|
||||
re_ = new PCRE("("+re+")", o);
|
||||
if (!re_->error().empty()) {
|
||||
LOG(INFO) << "Cannot PCRE: " << CEscape(re);
|
||||
error_ = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Releases everything the constructor built.  The parsed regexp is
// reference counted and is dropped with Decref(); the compiled forms are
// plain heap objects.  Members left NULL by a failed constructor are fine.
TestInstance::~TestInstance() {
  if (regexp_ != NULL)
    regexp_->Decref();

  delete prog_;
  delete rprog_;
  delete re_;
  delete re2_;
}
|
||||
|
||||
// Runs a single search using the named engine type.
|
||||
// This interface hides all the irregularities of the various
|
||||
// engine interfaces from the rest of this file.
|
||||
void TestInstance::RunSearch(Engine type,
|
||||
const StringPiece& orig_text,
|
||||
const StringPiece& orig_context,
|
||||
Prog::Anchor anchor,
|
||||
Result *result) {
|
||||
memset(result, 0, sizeof *result);
|
||||
if (regexp_ == NULL) {
|
||||
result->skipped = true;
|
||||
return;
|
||||
}
|
||||
int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
|
||||
if (nsubmatch > kMaxSubmatch)
|
||||
nsubmatch = kMaxSubmatch;
|
||||
|
||||
StringPiece text = orig_text;
|
||||
StringPiece context = orig_context;
|
||||
|
||||
switch (type) {
|
||||
default:
|
||||
LOG(FATAL) << "Bad RunSearch type: " << (int)type;
|
||||
|
||||
case kEngineBacktrack:
|
||||
if (prog_ == NULL) {
|
||||
result->skipped = true;
|
||||
break;
|
||||
}
|
||||
result->matched =
|
||||
prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
|
||||
result->submatch, nsubmatch);
|
||||
result->have_submatch = true;
|
||||
break;
|
||||
|
||||
case kEngineNFA:
|
||||
if (prog_ == NULL) {
|
||||
result->skipped = true;
|
||||
break;
|
||||
}
|
||||
result->matched =
|
||||
prog_->SearchNFA(text, context, anchor, kind_,
|
||||
result->submatch, nsubmatch);
|
||||
result->have_submatch = true;
|
||||
break;
|
||||
|
||||
case kEngineDFA:
|
||||
if (prog_ == NULL) {
|
||||
result->skipped = true;
|
||||
break;
|
||||
}
|
||||
result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
|
||||
&result->skipped, NULL);
|
||||
break;
|
||||
|
||||
case kEngineDFA1:
|
||||
if (prog_ == NULL || rprog_ == NULL) {
|
||||
result->skipped = true;
|
||||
break;
|
||||
}
|
||||
result->matched =
|
||||
prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
|
||||
&result->skipped, NULL);
|
||||
// If anchored, no need for second run,
|
||||
// but do it anyway to find more bugs.
|
||||
if (result->matched) {
|
||||
if (!rprog_->SearchDFA(result->submatch[0], context,
|
||||
Prog::kAnchored, Prog::kLongestMatch,
|
||||
result->submatch,
|
||||
&result->skipped, NULL)) {
|
||||
LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
|
||||
<< " on " << CEscape(text);
|
||||
result->matched = false;
|
||||
}
|
||||
}
|
||||
result->have_submatch0 = true;
|
||||
break;
|
||||
|
||||
case kEngineOnePass:
|
||||
if (prog_ == NULL ||
|
||||
anchor == Prog::kUnanchored ||
|
||||
!prog_->IsOnePass() ||
|
||||
nsubmatch > Prog::kMaxOnePassCapture) {
|
||||
result->skipped = true;
|
||||
break;
|
||||
}
|
||||
result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
|
||||
result->submatch, nsubmatch);
|
||||
result->have_submatch = true;
|
||||
break;
|
||||
|
||||
case kEngineBitState:
|
||||
if (prog_ == NULL) {
|
||||
result->skipped = true;
|
||||
break;
|
||||
}
|
||||
result->matched = prog_->SearchBitState(text, context, anchor, kind_,
|
||||
result->submatch, nsubmatch);
|
||||
result->have_submatch = true;
|
||||
break;
|
||||
|
||||
case kEngineRE2:
|
||||
case kEngineRE2a:
|
||||
case kEngineRE2b: {
|
||||
if (!re2_ || text.end() != context.end()) {
|
||||
result->skipped = true;
|
||||
break;
|
||||
}
|
||||
|
||||
RE2::Anchor re_anchor;
|
||||
if (anchor == Prog::kAnchored)
|
||||
re_anchor = RE2::ANCHOR_START;
|
||||
else
|
||||
re_anchor = RE2::UNANCHORED;
|
||||
if (kind_ == Prog::kFullMatch)
|
||||
re_anchor = RE2::ANCHOR_BOTH;
|
||||
|
||||
result->matched = re2_->Match(context,
|
||||
text.begin() - context.begin(),
|
||||
text.end() - context.begin(),
|
||||
re_anchor, result->submatch, nsubmatch);
|
||||
result->have_submatch = nsubmatch > 0;
|
||||
break;
|
||||
}
|
||||
|
||||
case kEnginePCRE: {
|
||||
if (!re_ || text.begin() != context.begin() ||
|
||||
text.end() != context.end()) {
|
||||
result->skipped = true;
|
||||
break;
|
||||
}
|
||||
|
||||
const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
|
||||
PCRE::Arg *a = new PCRE::Arg[nsubmatch];
|
||||
for (int i = 0; i < nsubmatch; i++) {
|
||||
a[i] = PCRE::Arg(&result->submatch[i]);
|
||||
argptr[i] = &a[i];
|
||||
}
|
||||
int consumed;
|
||||
PCRE::Anchor pcre_anchor;
|
||||
if (anchor == Prog::kAnchored)
|
||||
pcre_anchor = PCRE::ANCHOR_START;
|
||||
else
|
||||
pcre_anchor = PCRE::UNANCHORED;
|
||||
if (kind_ == Prog::kFullMatch)
|
||||
pcre_anchor = PCRE::ANCHOR_BOTH;
|
||||
re_->ClearHitLimit();
|
||||
result->matched =
|
||||
re_->DoMatch(text,
|
||||
pcre_anchor,
|
||||
&consumed,
|
||||
argptr, nsubmatch);
|
||||
if (re_->HitLimit()) {
|
||||
result->untrusted = true;
|
||||
delete[] argptr;
|
||||
delete[] a;
|
||||
break;
|
||||
}
|
||||
result->have_submatch = true;
|
||||
|
||||
// Work around RE interface bug: PCRE returns -1 as the
|
||||
// offsets for an unmatched subexpression, and RE should
|
||||
// turn that into StringPiece(NULL) but in fact it uses
|
||||
// StringPiece(text.begin() - 1, 0). Oops.
|
||||
for (int i = 0; i < nsubmatch; i++)
|
||||
if (result->submatch[i].begin() == text.begin() - 1)
|
||||
result->submatch[i] = NULL;
|
||||
delete[] argptr;
|
||||
delete[] a;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!result->matched)
|
||||
memset(result->submatch, 0, sizeof result->submatch);
|
||||
}
|
||||
|
||||
// Checks whether r is okay given that correct is the right answer.
|
||||
// Specifically, r's answers have to match (but it doesn't have to
|
||||
// claim to have all the answers).
|
||||
static bool ResultOkay(const Result& r, const Result& correct) {
|
||||
if (r.skipped)
|
||||
return true;
|
||||
if (r.matched != correct.matched)
|
||||
return false;
|
||||
if (r.have_submatch || r.have_submatch0) {
|
||||
for (int i = 0; i < kMaxSubmatch; i++) {
|
||||
if (correct.submatch[i].begin() != r.submatch[i].begin() ||
|
||||
correct.submatch[i].size() != r.submatch[i].size())
|
||||
return false;
|
||||
if (!r.have_submatch)
|
||||
break;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Runs a single test.
|
||||
bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
|
||||
Prog::Anchor anchor) {
|
||||
// Backtracking is the gold standard.
|
||||
Result correct;
|
||||
RunSearch(kEngineBacktrack, text, context, anchor, &correct);
|
||||
if (correct.skipped) {
|
||||
if (regexp_ == NULL)
|
||||
return true;
|
||||
LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
|
||||
<< " " << FormatMode(flags_);
|
||||
return false;
|
||||
}
|
||||
VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
|
||||
<< " text " << CEscape(text)
|
||||
<< " (" << FormatKind(kind_)
|
||||
<< ", " << FormatAnchor(anchor)
|
||||
<< ", " << FormatMode(flags_)
|
||||
<< ")";
|
||||
|
||||
// Compare the others.
|
||||
bool all_okay = true;
|
||||
for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
|
||||
if (!(Engines() & (1<<i)))
|
||||
continue;
|
||||
|
||||
Result r;
|
||||
RunSearch(i, text, context, anchor, &r);
|
||||
if (ResultOkay(r, correct)) {
|
||||
if (FLAGS_log_okay)
|
||||
LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
|
||||
continue;
|
||||
}
|
||||
|
||||
// We disagree with PCRE on the meaning of some Unicode matches.
|
||||
// In particular, we treat all non-ASCII UTF-8 as word characters.
|
||||
// We also treat "empty" character sets like [^\w\W] as being
|
||||
// impossible to match, while PCRE apparently excludes some code
|
||||
// points (e.g., 0x0080) from both \w and \W.
|
||||
if (i == kEnginePCRE && NonASCII(text))
|
||||
continue;
|
||||
|
||||
if (!r.untrusted)
|
||||
all_okay = false;
|
||||
|
||||
LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
|
||||
context, anchor);
|
||||
if (r.matched != correct.matched) {
|
||||
if (r.matched) {
|
||||
LOG(INFO) << " Should not match (but does).";
|
||||
} else {
|
||||
LOG(INFO) << " Should match (but does not).";
|
||||
continue;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 1+num_captures_; i++) {
|
||||
if (r.submatch[i].begin() != correct.submatch[i].begin() ||
|
||||
r.submatch[i].end() != correct.submatch[i].end()) {
|
||||
LOG(INFO) <<
|
||||
StringPrintf(" $%d: should be %s is %s",
|
||||
i,
|
||||
FormatCapture(text, correct.submatch[i]).c_str(),
|
||||
FormatCapture(text, r.submatch[i]).c_str());
|
||||
} else {
|
||||
LOG(INFO) <<
|
||||
StringPrintf(" $%d: %s ok", i,
|
||||
FormatCapture(text, r.submatch[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!all_okay) {
|
||||
if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
|
||||
LOG(QFATAL) << "Too many regexp failures.";
|
||||
}
|
||||
|
||||
return all_okay;
|
||||
}
|
||||
|
||||
void TestInstance::LogMatch(const char* prefix, Engine e,
|
||||
const StringPiece& text, const StringPiece& context,
|
||||
Prog::Anchor anchor) {
|
||||
LOG(INFO) << prefix
|
||||
<< EngineString(e)
|
||||
<< " regexp "
|
||||
<< CEscape(regexp_str_)
|
||||
<< " "
|
||||
<< CEscape(regexp_->ToString())
|
||||
<< " text "
|
||||
<< CEscape(text)
|
||||
<< " ("
|
||||
<< text.begin() - context.begin()
|
||||
<< ","
|
||||
<< text.end() - context.begin()
|
||||
<< ") of context "
|
||||
<< CEscape(context)
|
||||
<< " (" << FormatKind(kind_)
|
||||
<< ", " << FormatAnchor(anchor)
|
||||
<< ", " << FormatMode(flags_)
|
||||
<< ")";
|
||||
}
|
||||
|
||||
static Prog::MatchKind kinds[] = {
|
||||
Prog::kFirstMatch,
|
||||
Prog::kLongestMatch,
|
||||
Prog::kFullMatch,
|
||||
};
|
||||
|
||||
// Test all possible match kinds and parse modes.
|
||||
Tester::Tester(const StringPiece& regexp) {
|
||||
error_ = false;
|
||||
for (int i = 0; i < arraysize(kinds); i++) {
|
||||
for (int j = 0; j < arraysize(parse_modes); j++) {
|
||||
TestInstance* t = new TestInstance(regexp, kinds[i],
|
||||
parse_modes[j].parse_flags);
|
||||
error_ |= t->error();
|
||||
v_.push_back(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Destroys every TestInstance created by the constructor.
Tester::~Tester() {
  for (vector<TestInstance*>::iterator it = v_.begin(); it != v_.end(); ++it)
    delete *it;
}
|
||||
|
||||
bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
|
||||
Prog::Anchor anchor) {
|
||||
bool okay = true;
|
||||
for (int i = 0; i < v_.size(); i++)
|
||||
okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
|
||||
return okay;
|
||||
}
|
||||
|
||||
// Anchoring modes exercised for every input.
static Prog::Anchor anchors[] = {
  Prog::kAnchored,
  Prog::kUnanchored
};
|
||||
|
||||
bool Tester::TestInput(const StringPiece& text) {
|
||||
bool okay = TestInputInContext(text, text);
|
||||
if (text.size() > 0) {
|
||||
StringPiece sp;
|
||||
sp = text;
|
||||
sp.remove_prefix(1);
|
||||
okay &= TestInputInContext(sp, text);
|
||||
sp = text;
|
||||
sp.remove_suffix(1);
|
||||
okay &= TestInputInContext(sp, text);
|
||||
}
|
||||
return okay;
|
||||
}
|
||||
|
||||
bool Tester::TestInputInContext(const StringPiece& text,
|
||||
const StringPiece& context) {
|
||||
bool okay = true;
|
||||
for (int i = 0; i < arraysize(anchors); i++)
|
||||
okay &= TestCase(text, context, anchors[i]);
|
||||
return okay;
|
||||
}
|
||||
|
||||
bool TestRegexpOnText(const StringPiece& regexp,
|
||||
const StringPiece& text) {
|
||||
Tester t(regexp);
|
||||
return t.TestInput(text);
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,121 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Comparative tester for regular expression matching.
|
||||
// Checks all implementations against each other.
|
||||
|
||||
#ifndef RE2_TESTING_TESTER_H__
|
||||
#define RE2_TESTING_TESTER_H__
|
||||
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/re2.h"
|
||||
#include "util/pcre.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class Regexp;
|
||||
|
||||
// All the supported regexp engines.
// The values double as bit positions in the engine mask and as indices
// into the table of engine names, so kEngineMax must stay last.
enum Engine {
  kEngineBacktrack = 0,    // Prog::UnsafeSearchBacktrack
  kEngineNFA,              // Prog::SearchNFA
  kEngineDFA,              // Prog::SearchDFA, only ask whether it matched
  kEngineDFA1,             // Prog::SearchDFA, ask for match[0]
  kEngineOnePass,          // Prog::SearchOnePass, if applicable
  kEngineBitState,         // Prog::SearchBitState
  kEngineRE2,              // RE2, all submatches
  kEngineRE2a,             // RE2, only ask for match[0]
  kEngineRE2b,             // RE2, only ask whether it matched
  kEnginePCRE,             // PCRE (util/pcre.h)

  kEngineMax,
};
|
||||
|
||||
// Make normal math on the enum preserve the type.
|
||||
// By default, C++ doesn't define ++ on enum, and e+1 has type int.
|
||||
static inline void operator++(Engine& e, int unused) {
|
||||
e = static_cast<Engine>(e+1);
|
||||
}
|
||||
|
||||
// Adds an integer offset to an Engine, keeping the result typed as Engine.
static inline Engine operator+(Engine e, int i) {
  const int sum = static_cast<int>(e) + i;
  return static_cast<Engine>(sum);
}
|
||||
|
||||
// A TestInstance caches per-regexp state for a given
|
||||
// regular expression in a given configuration
|
||||
// (UTF-8 vs Latin1, longest vs first match, etc.).
|
||||
class TestInstance {
|
||||
public:
|
||||
struct Result;
|
||||
|
||||
TestInstance(const StringPiece& regexp, Prog::MatchKind kind,
|
||||
Regexp::ParseFlags flags);
|
||||
~TestInstance();
|
||||
Regexp::ParseFlags flags() { return flags_; }
|
||||
bool error() { return error_; }
|
||||
|
||||
// Runs a single test case: search in text, which is in context,
|
||||
// using the given anchoring.
|
||||
bool RunCase(const StringPiece& text, const StringPiece& context,
|
||||
Prog::Anchor anchor);
|
||||
|
||||
private:
|
||||
// Runs a single search using the named engine type.
|
||||
void RunSearch(Engine type,
|
||||
const StringPiece& text, const StringPiece& context,
|
||||
Prog::Anchor anchor,
|
||||
Result *result);
|
||||
|
||||
void LogMatch(const char* prefix, Engine e, const StringPiece& text,
|
||||
const StringPiece& context, Prog::Anchor anchor);
|
||||
|
||||
const StringPiece& regexp_str_; // regexp being tested
|
||||
Prog::MatchKind kind_; // kind of match
|
||||
Regexp::ParseFlags flags_; // flags for parsing regexp_str_
|
||||
bool error_; // error during constructor?
|
||||
|
||||
Regexp* regexp_; // parsed regexp
|
||||
int num_captures_; // regexp_->NumCaptures() cached
|
||||
Prog* prog_; // compiled program
|
||||
Prog* rprog_; // compiled reverse program
|
||||
PCRE* re_; // PCRE implementation
|
||||
RE2* re2_; // RE2 implementation
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(TestInstance);
|
||||
};
|
||||
|
||||
// A group of TestInstances for all possible configurations.
|
||||
class Tester {
|
||||
public:
|
||||
explicit Tester(const StringPiece& regexp);
|
||||
~Tester();
|
||||
|
||||
bool error() { return error_; }
|
||||
|
||||
// Runs a single test case: search in text, which is in context,
|
||||
// using the given anchoring.
|
||||
bool TestCase(const StringPiece& text, const StringPiece& context,
|
||||
Prog::Anchor anchor);
|
||||
|
||||
// Run TestCase(text, text, anchor) for all anchoring modes.
|
||||
bool TestInput(const StringPiece& text);
|
||||
|
||||
// Run TestCase(text, context, anchor) for all anchoring modes.
|
||||
bool TestInputInContext(const StringPiece& text, const StringPiece& context);
|
||||
|
||||
private:
|
||||
bool error_;
|
||||
vector<TestInstance*> v_;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Tester);
|
||||
};
|
||||
|
||||
// Run all possible tests using regexp and text.
|
||||
bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_TESTING_TESTER_H__
|
|
@ -0,0 +1,207 @@
|
|||
#!/usr/bin/python2.4
|
||||
#
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
"""Unittest for the util/regexp/re2/unicode.py module."""
|
||||
|
||||
import os
|
||||
import StringIO
|
||||
from google3.pyglib import flags
|
||||
from google3.testing.pybase import googletest
|
||||
from google3.util.regexp.re2 import unicode
|
||||
|
||||
_UNICODE_DIR = os.path.join(flags.FLAGS.test_srcdir, "google3", "third_party",
|
||||
"unicode", "ucd-5.1.0")
|
||||
|
||||
|
||||
class ConvertTest(googletest.TestCase):
  """Checks the string <-> code point conversion helpers."""

  def testUInt(self):
    # Well-formed hex strings convert to the matching code point.
    for want, text in ((0x0000, "0000"),
                       (0x263A, "263A"),
                       (0x10FFFF, "10FFFF")):
      self.assertEquals(want, unicode._UInt(text))
    # Too short, too long, and beyond the largest code point.
    for bad in ("263", "263AAAA", "110000"):
      self.assertRaises(unicode.InputError, unicode._UInt, bad)

  def testURange(self):
    for want, text in (([1, 2, 3], "0001..0003"),
                       ([1], "0001")):
      self.assertEquals(want, unicode._URange(text))
    # Three endpoints, a descending range, and a degenerate range.
    for bad in ("0001..0003..0005", "0003..0001", "0001..0001"):
      self.assertRaises(unicode.InputError, unicode._URange, bad)

  def testUStr(self):
    for want, code in (("0x263A", 0x263a),
                       ("0x10FFFF", 0x10FFFF)):
      self.assertEquals(want, unicode._UStr(code))
    for bad in (0x110000, -1):
      self.assertRaises(unicode.InputError, unicode._UStr, bad)
|
||||
|
||||
|
||||
_UNICODE_TABLE = """# Commented line, should be ignored.
|
||||
# The next line is blank and should be ignored.
|
||||
|
||||
0041;Capital A;Line 1
|
||||
0061..007A;Lowercase;Line 2
|
||||
1F00;<Greek, First>;Ignored
|
||||
1FFE;<Greek, Last>;Line 3
|
||||
10FFFF;Runemax;Line 4
|
||||
0000;Zero;Line 5
|
||||
"""
|
||||
|
||||
_BAD_TABLE1 = """
|
||||
111111;Not a code point;
|
||||
"""
|
||||
|
||||
_BAD_TABLE2 = """
|
||||
0000;<Zero, First>;Missing <Zero, Last>
|
||||
"""
|
||||
|
||||
_BAD_TABLE3 = """
|
||||
0010..0001;Bad range;
|
||||
"""
|
||||
|
||||
|
||||
class AbortError(Exception):
  """Function should not have been called."""


def Abort(*unused_args):
  """Callback that must never run; raises AbortError if it does.

  Any positional arguments are accepted and ignored, so this can be passed
  as a doline(codes, fields) callback and still fail with AbortError
  instead of a TypeError about the argument count.

  Args:
    *unused_args: ignored.

  Raises:
    AbortError: always.
  """
  raise AbortError("Abort")
|
||||
|
||||
|
||||
def StringTable(s, n, f):
  """Runs unicode.ReadUnicodeTable over the in-memory table text s.

  Args:
    s: table contents, wrapped in a StringIO object before reading.
    n: number of fields passed through to ReadUnicodeTable.
    f: per-line callback passed through to ReadUnicodeTable.
  """
  table = StringIO.StringIO(s)
  unicode.ReadUnicodeTable(table, n, f)
|
||||
|
||||
|
||||
class ReadUnicodeTableTest(googletest.TestCase):
  """Exercises unicode.ReadUnicodeTable on in-memory tables."""

  def testSimpleTable(self):
    # (codes, first field, second field) expected for each callback
    # invocation, in call order.
    expected = [
        ([0x0041], "0041", "Capital A"),
        (range(0x0061, 0x007A + 1), "0061..007A", "Lowercase"),
        (range(0x1F00, 0x1FFE + 1), "1F00..1FFE", "Greek"),
        ([0x10FFFF], "10FFFF", "Runemax"),
        ([0x0000], "0000", "Zero"),
    ]

    # One-element list so the nested function can update the counter.
    calls = [0]

    def CheckLine(codes, fields):
      self.assertEquals(3, len(fields))
      calls[0] += 1
      self.assertEquals("Line %d" % (calls[0],), fields[2])
      if calls[0] <= len(expected):
        want_codes, want_first, want_name = expected[calls[0] - 1]
        self.assertEquals(want_codes, codes)
        self.assertEquals(want_first, fields[0])
        self.assertEquals(want_name, fields[1])

    StringTable(_UNICODE_TABLE, 3, CheckLine)
    self.assertEquals(5, calls[0])

  def testErrorTables(self):
    # Each (table, field count) pair must be rejected with InputError
    # without the callback ever being reached.
    cases = (
        (_UNICODE_TABLE, 4),
        (_UNICODE_TABLE, 2),
        (_BAD_TABLE1, 3),
        (_BAD_TABLE2, 3),
        (_BAD_TABLE3, 3),
    )
    for table, nfields in cases:
      self.assertRaises(unicode.InputError, StringTable, table, nfields, Abort)
|
||||
|
||||
|
||||
class ParseContinueTest(googletest.TestCase):
  """Checks unicode._ParseContinue."""

  def testParseContinue(self):
    # (expected result, input field); only the First/Last tags are split.
    cases = (
        (("Private Use", "First"), "<Private Use, First>"),
        (("Private Use", "Last"), "<Private Use, Last>"),
        (("<Private Use, Blah>", None), "<Private Use, Blah>"),
    )
    for want, field in cases:
      self.assertEquals(want, unicode._ParseContinue(field))
|
||||
|
||||
|
||||
class CaseGroupsTest(googletest.TestCase):
  """Test the CaseGroups function (and the CaseFoldingReader)."""

  def FindGroup(self, c):
    """Returns the first group in self.groups containing c, or None.

    Args:
      c: a code point, or a str character that is converted with ord().
    """
    if type(c) == str:
      c = ord(c)
    for g in self.groups:
      if c in g:
        return g
    return None

  def testCaseGroups(self):
    # CaseGroups returns a (togroup, groups) pair; only the list of groups
    # is searched by FindGroup, so the mapping is discarded here.
    _, self.groups = unicode.CaseGroups(unicode_dir=_UNICODE_DIR)
    self.assertEquals([ord("A"), ord("a")], self.FindGroup("a"))
    self.assertEquals(None, self.FindGroup("0"))
|
||||
|
||||
|
||||
class ScriptsTest(googletest.TestCase):
  """Checks unicode.Scripts against the Unicode data directory."""

  def FindScript(self, c):
    """Returns the name of a script whose code list contains c, or None.

    Args:
      c: a code point, or a str character that is converted with ord().
    """
    if type(c) == str:
      c = ord(c)
    for script, codes in self.scripts.items():
      if c in codes:
        return script
    return None

  def testScripts(self):
    self.scripts = unicode.Scripts(unicode_dir=_UNICODE_DIR)
    for want, c in (("Latin", "a"),
                    ("Common", "0"),
                    (None, 0xFFFE)):
      self.assertEquals(want, self.FindScript(c))
|
||||
|
||||
|
||||
class CategoriesTest(googletest.TestCase):
  """Checks unicode.Categories against the Unicode data directory."""

  def FindCategory(self, c):
    """Returns a category containing c, or None.

    A category with a name longer than one character (such as Nd) is
    preferred over a one-letter category (such as N).

    Args:
      c: a code point, or a str character that is converted with ord().
    """
    if type(c) == str:
      c = ord(c)
    fallback = None
    for category, codes in self.categories.items():
      if c not in codes:
        continue
      if len(category) > 1:
        return category
      if fallback is None:
        fallback = category
    return fallback

  def testCategories(self):
    self.categories = unicode.Categories(unicode_dir=_UNICODE_DIR)

    # (expected category, character or code point)
    lookups = (
        ("Ll", "a"),
        ("Nd", "0"),
        ("Lo", 0xAD00),  # in First, Last range
        (None, 0xFFFE),
        ("Lo", 0x8B5A),
        ("Lo", 0x6C38),
        ("Lo", 0x92D2),
    )
    for want, c in lookups:
      self.assertEquals(want, self.FindCategory(c))

    # The one-letter categories also list the same code points.
    memberships = (
        (ord("a"), "L"),
        (ord("0"), "N"),
        (0x8B5A, "L"),
        (0x6C38, "L"),
        (0x92D2, "L"),
    )
    for code, major in memberships:
      self.assertTrue(code in self.categories[major])
|
||||
|
||||
def main():
  """Hands control to the googletest runner."""
  googletest.main()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -0,0 +1,341 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Format a regular expression structure as a string.
|
||||
// Tested by parse_test.cc
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
enum {
|
||||
PrecAtom,
|
||||
PrecUnary,
|
||||
PrecConcat,
|
||||
PrecAlternate,
|
||||
PrecEmpty,
|
||||
PrecParen,
|
||||
PrecToplevel,
|
||||
};
|
||||
|
||||
// Helper function. See description below.
|
||||
static void AppendCCRange(string* t, Rune lo, Rune hi);
|
||||
|
||||
// Walker to generate string in s_.
|
||||
// The arg pointers are actually integers giving the
|
||||
// context precedence.
|
||||
// The child_args are always NULL.
|
||||
class ToStringWalker : public Regexp::Walker<int> {
 public:
  // t is the string that the visit methods append to.
  explicit ToStringWalker(string* t) : t_(t) {}

  // Called before a node's children; defined below.
  virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);

  // Called after a node's children; defined below.
  virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
                        int* child_args, int nchild_args);

  // Appends nothing and returns 0.
  virtual int ShortVisit(Regexp* re, int parent_arg) { return 0; }

 private:
  string* t_;  // Destination string supplied to the constructor.

  DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
};
|
||||
|
||||
// Formats this regexp as a string by walking it with a ToStringWalker,
// starting at top-level precedence.  If the walker reports that it stopped
// early, " [truncated]" is appended to whatever was produced.
string Regexp::ToString() {
  string result;
  ToStringWalker walker(&result);
  // NOTE(review): 100000 is presumably the walker's visit budget -- confirm
  // against WalkExponential.
  walker.WalkExponential(this, PrecToplevel, 100000);
  if (!walker.stopped_early())
    return result;
  result += " [truncated]";
  return result;
}
|
||||
|
||||
#define ToString DontCallToString // Avoid accidental recursion.
|
||||
|
||||
// Visits re before children are processed.
|
||||
// Appends ( if needed and passes new precedence to children.
|
||||
// Runs before the children of re are visited.
// parent_arg is the precedence of the enclosing context.  Opens a "(?:"
// (or a capture "(") when the context requires it, and returns the
// precedence that the children are to be formatted under.
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
  const int outer = parent_arg;

  switch (re->op()) {
    // Childless nodes: nothing to open.
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpEndText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpCharClass:
    case kRegexpHaveMatch:
      return PrecAtom;

    case kRegexpConcat:
    case kRegexpLiteralString:
      if (outer < PrecConcat)
        t_->append("(?:");
      return PrecConcat;

    case kRegexpAlternate:
      if (outer < PrecAlternate)
        t_->append("(?:");
      return PrecAlternate;

    case kRegexpCapture:
      t_->append("(");
      if (re->name()) {
        // Named group: (?P<name>
        t_->append("?P<");
        t_->append(*re->name());
        t_->append(">");
      }
      return PrecParen;

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
      if (outer < PrecUnary)
        t_->append("(?:");
      // The subprecedence here is PrecAtom instead of PrecUnary
      // because PCRE treats two unary ops in a row as a parse error.
      return PrecAtom;
  }

  // Any op not listed above is treated as an atom.
  return PrecAtom;
}
|
||||
|
||||
// Appends the literal rune r to *t.
//  - ASCII regexp punctuation is written with a leading backslash.
//  - With foldcase set, a lower-case ASCII letter is written as the
//    two-character class [Xx].
//  - Everything else goes through AppendCCRange(t, r, r).
static void AppendLiteral(string *t, Rune r, bool foldcase) {
  const bool is_punct =
      r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r);
  if (is_punct) {
    t->append(1, '\\');
    t->append(1, r);
    return;
  }

  if (foldcase && 'a' <= r && r <= 'z') {
    // r is known to be lower case here; emit upper case first, then r.
    const Rune upper = r + ('A' - 'a');
    t->append(1, '[');
    t->append(1, upper);
    t->append(1, upper + 'a' - 'A');
    t->append(1, ']');
    return;
  }

  AppendCCRange(t, r, r);
}
|
||||
|
||||
// Visits re after children are processed.
|
||||
// For childless regexps, all the work is done here.
|
||||
// For regexps with children, append any unary suffixes or ).
|
||||
// Runs after the children of re have been visited.
// parent_arg is the precedence of the enclosing context (the same value
// PreVisit saw).  Childless nodes are formatted entirely here; nodes with
// children get their unary suffix and/or closing ")" appended.  When the
// enclosing context is an alternation, a trailing "|" is appended for the
// parent to trim.  Always returns 0.
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
                              int* child_args, int nchild_args) {
  int prec = parent_arg;
  switch (re->op()) {
    case kRegexpNoMatch:
      // There's no simple symbol for "no match", but
      // [^0-Runemax] excludes everything.
      t_->append("[^\\x00-\\x{10ffff}]");
      break;

    case kRegexpEmptyMatch:
      // Append (?:) to make empty string visible,
      // unless this is already being parenthesized.
      if (prec < PrecEmpty)
        t_->append("(?:)");
      break;

    case kRegexpLiteral:
      AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
      break;

    case kRegexpLiteralString:
      for (int i = 0; i < re->nrunes(); i++)
        AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
      if (prec < PrecConcat)
        t_->append(")");
      break;

    case kRegexpConcat:
      if (prec < PrecConcat)
        t_->append(")");
      break;

    case kRegexpAlternate:
      // Clumsy but workable: the children all appended |
      // at the end of their strings, so just remove the last one.
      // The emptiness check keeps size()-1 from wrapping around.
      if (!t_->empty() && (*t_)[t_->size()-1] == '|')
        t_->erase(t_->size()-1);
      else
        LOG(DFATAL) << "Bad final char: " << *t_;  // log the text, not the pointer
      if (prec < PrecAlternate)
        t_->append(")");
      break;

    case kRegexpStar:
      t_->append("*");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpPlus:
      t_->append("+");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpQuest:
      t_->append("?");
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpRepeat:
      // max() == -1 means there is no upper bound.
      if (re->max() == -1)
        t_->append(StringPrintf("{%d,}", re->min()));
      else if (re->min() == re->max())
        t_->append(StringPrintf("{%d}", re->min()));
      else
        t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
      if (re->parse_flags() & Regexp::NonGreedy)
        t_->append("?");
      if (prec < PrecUnary)
        t_->append(")");
      break;

    case kRegexpAnyChar:
      t_->append(".");
      break;

    case kRegexpAnyByte:
      t_->append("\\C");
      break;

    case kRegexpBeginLine:
      t_->append("^");
      break;

    case kRegexpEndLine:
      t_->append("$");
      break;

    case kRegexpBeginText:
      t_->append("(?-m:^)");
      break;

    case kRegexpEndText:
      if (re->parse_flags() & Regexp::WasDollar)
        t_->append("(?-m:$)");
      else
        t_->append("\\z");
      break;

    case kRegexpWordBoundary:
      t_->append("\\b");
      break;

    case kRegexpNoWordBoundary:
      t_->append("\\B");
      break;

    case kRegexpCharClass: {
      if (re->cc()->size() == 0) {
        t_->append("[^\\x00-\\x{10ffff}]");
        break;
      }
      t_->append("[");
      // Heuristic: show class as negated if it contains the
      // non-character 0xFFFE.
      CharClass* cc = re->cc();
      if (cc->Contains(0xFFFE)) {
        cc = cc->Negate();
        t_->append("^");
      }
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
        AppendCCRange(t_, i->lo, i->hi);
      // Negate() produced a separate class; release it.
      if (cc != re->cc())
        cc->Delete();
      t_->append("]");
      break;
    }

    case kRegexpCapture:
      t_->append(")");
      break;

    case kRegexpHaveMatch:
      // There's no syntax accepted by the parser to generate
      // this node (it is generated by RE2::Set) so make something
      // up that is readable but won't compile.
      // Format first: string::append(const char*, size_t) would treat the
      // match id as a byte count instead of a printf argument.
      t_->append(StringPrintf("(?HaveMatch:%d)", re->match_id()));
      break;
  }

  // If the parent is an alternation, append the | for it.
  if (prec == PrecAlternate)
    t_->append("|");

  return 0;
}
|
||||
|
||||
// Appends a rune for use in a character class to the string t.
|
||||
// Appends rune r to *t in a form usable inside a character class:
// printable ASCII as itself (backslash-escaped for []^-\), the common
// control characters as \r \t \n \f, other values below 0x100 as \xNN,
// and everything else as \x{N...}.
static void AppendCCChar(string* t, Rune r) {
  if (r >= 0x20 && r <= 0x7E) {
    if (strchr("[]^-\\", r) != NULL)
      t->append("\\");
    t->append(1, r);
    return;
  }

  // Named escapes for a few control characters.
  const char* named = NULL;
  switch (r) {
    case '\r': named = "\\r"; break;
    case '\t': named = "\\t"; break;
    case '\n': named = "\\n"; break;
    case '\f': named = "\\f"; break;
    default:   break;
  }
  if (named != NULL) {
    t->append(named);
    return;
  }

  if (r < 0x100)
    StringAppendF(t, "\\x%02x", static_cast<int>(r));
  else
    StringAppendF(t, "\\x{%x}", static_cast<int>(r));
}
|
||||
|
||||
// Appends the character-class range lo-hi to *t.  A single-rune range is
// written as just that rune; an inverted range (lo > hi) appends nothing.
static void AppendCCRange(string* t, Rune lo, Rune hi) {
  if (lo <= hi) {
    AppendCCChar(t, lo);
    if (lo != hi) {
      t->append("-");
      AppendCCChar(t, hi);
    }
  }
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,297 @@
|
|||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
"""Parser for Unicode data files (as distributed by unicode.org)."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import urllib2
|
||||
|
||||
# Directory or URL where Unicode tables reside.
|
||||
_UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd"
|
||||
|
||||
# Largest valid Unicode code value.
|
||||
_RUNE_MAX = 0x10FFFF
|
||||
|
||||
|
||||
class Error(Exception):
|
||||
"""Unicode error base class."""
|
||||
|
||||
|
||||
class InputError(Error):
|
||||
"""Unicode input error class. Raised on invalid input."""
|
||||
|
||||
|
||||
def _UInt(s):
  """Parses a hex string as a Unicode code point ('263A' => 0x263a).

  The string must be 4 to 6 characters long and its value must lie
  between 0 and _RUNE_MAX inclusive.

  Args:
    s: string to convert

  Returns:
    Unicode code point

  Raises:
    InputError: the string is not a valid Unicode value.
  """
  try:
    value = int(s, 16)
  except ValueError:
    value = -1  # forces the range check below to fail

  length_ok = 4 <= len(s) <= 6
  value_ok = 0 <= value <= _RUNE_MAX
  if not (length_ok and value_ok):
    raise InputError("invalid Unicode value %s" % (s,))
  return value
|
||||
|
||||
|
||||
def _URange(s):
  """Parses a code point or a 'lo..hi' range into a list of code points.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

  A range must be strictly ascending: 'lo..hi' with lo >= hi is rejected.

  Args:
    s: string to convert

  Returns:
    Unicode range

  Raises:
    InputError: the string is not a valid Unicode range.
  """
  pieces = s.split("..")
  count = len(pieces)
  if count == 1:
    return [_UInt(pieces[0])]
  if count == 2:
    lo, hi = _UInt(pieces[0]), _UInt(pieces[1])
    if lo < hi:
      return range(lo, hi + 1)
  # More than two endpoints, or an empty/descending range.
  raise InputError("invalid Unicode range %s" % (s,))
|
||||
|
||||
|
||||
def _UStr(v):
  """Formats a Unicode code point as an upper-case hex string.

    0x263a => '0x263A'.

  Args:
    v: code point to convert

  Returns:
    Unicode string

  Raises:
    InputError: the argument is not a valid Unicode value.
  """
  too_small = v < 0
  too_large = v > _RUNE_MAX
  if too_small or too_large:
    raise InputError("invalid Unicode value %s" % (v,))
  return "0x%04X" % (v,)
|
||||
|
||||
|
||||
def _ParseContinue(s):
|
||||
"""Parses a Unicode continuation field.
|
||||
|
||||
These are of the form '<Name, First>' or '<Name, Last>'.
|
||||
Instead of giving an explicit range in a single table entry,
|
||||
some Unicode tables use two entries, one for the first
|
||||
code value in the range and one for the last.
|
||||
The first entry's description is '<Name, First>' instead of 'Name'
|
||||
and the second is '<Name, Last>'.
|
||||
|
||||
'<Name, First>' => ('Name', 'First')
|
||||
'<Name, Last>' => ('Name', 'Last')
|
||||
'Anything else' => ('Anything else', None)
|
||||
|
||||
Args:
|
||||
s: continuation field string
|
||||
|
||||
Returns:
|
||||
pair: name and ('First', 'Last', or None)
|
||||
"""
|
||||
|
||||
match = re.match("<(.*), (First|Last)>", s)
|
||||
if match is not None:
|
||||
return match.groups()
|
||||
return (s, None)
|
||||
|
||||
|
||||
def ReadUnicodeTable(filename, nfields, doline):
|
||||
"""Generic Unicode table text file reader.
|
||||
|
||||
The reader takes care of stripping out comments and also
|
||||
parsing the two different ways that the Unicode tables specify
|
||||
code ranges (using the .. notation and splitting the range across
|
||||
multiple lines).
|
||||
|
||||
Each non-comment line in the table is expected to have the given
|
||||
number of fields. The first field is known to be the Unicode value
|
||||
and the second field its description.
|
||||
|
||||
The reader calls doline(codes, fields) for each entry in the table.
|
||||
If fn raises an exception, the reader prints that exception,
|
||||
prefixed with the file name and line number, and continues
|
||||
processing the file. When done with the file, the reader re-raises
|
||||
the first exception encountered during the file.
|
||||
|
||||
Arguments:
|
||||
filename: the Unicode data file to read, or a file-like object.
|
||||
nfields: the number of expected fields per line in that file.
|
||||
doline: the function to call for each table entry.
|
||||
|
||||
Raises:
|
||||
InputError: nfields is invalid (must be >= 2).
|
||||
"""
|
||||
|
||||
if nfields < 2:
|
||||
raise InputError("invalid number of fields %d" % (nfields,))
|
||||
|
||||
if type(filename) == str:
|
||||
if filename.startswith("http://"):
|
||||
fil = urllib2.urlopen(filename)
|
||||
else:
|
||||
fil = open(filename, "r")
|
||||
else:
|
||||
fil = filename
|
||||
|
||||
first = None # first code in multiline range
|
||||
expect_last = None # tag expected for "Last" line in multiline range
|
||||
lineno = 0 # current line number
|
||||
for line in fil:
|
||||
lineno += 1
|
||||
try:
|
||||
# Chop # comments and white space; ignore empty lines.
|
||||
sharp = line.find("#")
|
||||
if sharp >= 0:
|
||||
line = line[:sharp]
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Split fields on ";", chop more white space.
|
||||
# Must have the expected number of fields.
|
||||
fields = [s.strip() for s in line.split(";")]
|
||||
if len(fields) != nfields:
|
||||
raise InputError("wrong number of fields %d %d - %s" %
|
||||
(len(fields), nfields, line))
|
||||
|
||||
# The Unicode text files have two different ways
|
||||
# to list a Unicode range. Either the first field is
|
||||
# itself a range (0000..FFFF), or the range is split
|
||||
# across two lines, with the second field noting
|
||||
# the continuation.
|
||||
codes = _URange(fields[0])
|
||||
(name, cont) = _ParseContinue(fields[1])
|
||||
|
||||
if expect_last is not None:
|
||||
# If the last line gave the First code in a range,
|
||||
# this one had better give the Last one.
|
||||
if (len(codes) != 1 or codes[0] <= first or
|
||||
cont != "Last" or name != expect_last):
|
||||
raise InputError("expected Last line for %s" %
|
||||
(expect_last,))
|
||||
codes = range(first, codes[0] + 1)
|
||||
first = None
|
||||
expect_last = None
|
||||
fields[0] = "%04X..%04X" % (codes[0], codes[-1])
|
||||
fields[1] = name
|
||||
elif cont == "First":
|
||||
# Otherwise, if this is the First code in a range,
|
||||
# remember it and go to the next line.
|
||||
if len(codes) != 1:
|
||||
raise InputError("bad First line: range given")
|
||||
expect_last = name
|
||||
first = codes[0]
|
||||
continue
|
||||
|
||||
doline(codes, fields)
|
||||
|
||||
except Exception, e:
|
||||
print "%s:%d: %s" % (filename, lineno, e)
|
||||
raise
|
||||
|
||||
if expect_last is not None:
|
||||
raise InputError("expected Last line for %s; got EOF" %
|
||||
(expect_last,))
|
||||
|
||||
|
||||
def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Returns the Unicode code groups equivalent under case folding.

  Only CaseFolding.txt entries whose fold type is "C" or "S" are used.

  Each group is a sorted list of code points,
  and the list of groups is sorted by first code point
  in the group.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    A pair (togroup, groups):
      togroup: dict mapping each fold target code point (the third field
        of a CaseFolding.txt line) to its group.
      groups: sorted list of those same group lists.
  """

  # Dict mapping lowercase code point to fold-equivalent group.
  togroup = {}

  def DoLine(codes, fields):
    """Process single CaseFolding.txt line, updating togroup."""
    (_, foldtype, lower, _) = fields
    if foldtype not in ("C", "S"):
      return
    lower = _UInt(lower)
    # A new group starts out holding the fold target itself.
    togroup.setdefault(lower, [lower]).extend(codes)

  ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

  groups = togroup.values()
  # Sorting in place also orders the lists reachable through togroup.
  for g in groups:
    g.sort()
  groups.sort()
  return togroup, groups
|
||||
|
||||
|
||||
def Scripts(unicode_dir=_UNICODE_DIR):
  """Reads Scripts.txt and groups code points by script name.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping script names to code lists
  """

  by_script = {}

  def AddLine(codes, fields):
    """Appends the codes of one Scripts.txt line to that script's list."""
    _, script = fields
    if script not in by_script:
      by_script[script] = []
    by_script[script].extend(codes)

  ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, AddLine)
  return by_script
|
||||
|
||||
|
||||
def Categories(unicode_dir=_UNICODE_DIR):
  """Reads UnicodeData.txt and groups code points by category.

  Code points are listed under their full category name and, when that
  name is longer than one character, under its first letter as well
  (so the codes from Lu also appear in L).

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping category names to code lists
  """

  by_category = {}

  def AddLine(codes, fields):
    """Records one UnicodeData.txt line under its category name(s)."""
    full_name = fields[2]
    names = [full_name]
    if len(full_name) > 1:
      # Add codes from Lu into L, etc.
      names.append(full_name[0])
    for name in names:
      by_category.setdefault(name, []).extend(codes)

  ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, AddLine)
  return by_category
|
||||
|
|
@ -0,0 +1,469 @@
|
|||
|
||||
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
|
||||
// make_unicode_casefold.py >unicode_casefold.cc
|
||||
|
||||
#include "re2/unicode_casefold.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
|
||||
// 1029 groups, 2079 pairs, 282 ranges
|
||||
CaseFold unicode_casefold[] = {
|
||||
{ 65, 90, 32 },
|
||||
{ 97, 106, -32 },
|
||||
{ 107, 107, 8383 },
|
||||
{ 108, 114, -32 },
|
||||
{ 115, 115, 268 },
|
||||
{ 116, 122, -32 },
|
||||
{ 181, 181, 743 },
|
||||
{ 192, 214, 32 },
|
||||
{ 216, 222, 32 },
|
||||
{ 223, 223, 7615 },
|
||||
{ 224, 228, -32 },
|
||||
{ 229, 229, 8262 },
|
||||
{ 230, 246, -32 },
|
||||
{ 248, 254, -32 },
|
||||
{ 255, 255, 121 },
|
||||
{ 256, 303, EvenOdd },
|
||||
{ 306, 311, EvenOdd },
|
||||
{ 313, 328, OddEven },
|
||||
{ 330, 375, EvenOdd },
|
||||
{ 376, 376, -121 },
|
||||
{ 377, 382, OddEven },
|
||||
{ 383, 383, -300 },
|
||||
{ 384, 384, 195 },
|
||||
{ 385, 385, 210 },
|
||||
{ 386, 389, EvenOdd },
|
||||
{ 390, 390, 206 },
|
||||
{ 391, 392, OddEven },
|
||||
{ 393, 394, 205 },
|
||||
{ 395, 396, OddEven },
|
||||
{ 398, 398, 79 },
|
||||
{ 399, 399, 202 },
|
||||
{ 400, 400, 203 },
|
||||
{ 401, 402, OddEven },
|
||||
{ 403, 403, 205 },
|
||||
{ 404, 404, 207 },
|
||||
{ 405, 405, 97 },
|
||||
{ 406, 406, 211 },
|
||||
{ 407, 407, 209 },
|
||||
{ 408, 409, EvenOdd },
|
||||
{ 410, 410, 163 },
|
||||
{ 412, 412, 211 },
|
||||
{ 413, 413, 213 },
|
||||
{ 414, 414, 130 },
|
||||
{ 415, 415, 214 },
|
||||
{ 416, 421, EvenOdd },
|
||||
{ 422, 422, 218 },
|
||||
{ 423, 424, OddEven },
|
||||
{ 425, 425, 218 },
|
||||
{ 428, 429, EvenOdd },
|
||||
{ 430, 430, 218 },
|
||||
{ 431, 432, OddEven },
|
||||
{ 433, 434, 217 },
|
||||
{ 435, 438, OddEven },
|
||||
{ 439, 439, 219 },
|
||||
{ 440, 441, EvenOdd },
|
||||
{ 444, 445, EvenOdd },
|
||||
{ 447, 447, 56 },
|
||||
{ 452, 452, EvenOdd },
|
||||
{ 453, 453, OddEven },
|
||||
{ 454, 454, -2 },
|
||||
{ 455, 455, OddEven },
|
||||
{ 456, 456, EvenOdd },
|
||||
{ 457, 457, -2 },
|
||||
{ 458, 458, EvenOdd },
|
||||
{ 459, 459, OddEven },
|
||||
{ 460, 460, -2 },
|
||||
{ 461, 476, OddEven },
|
||||
{ 477, 477, -79 },
|
||||
{ 478, 495, EvenOdd },
|
||||
{ 497, 497, OddEven },
|
||||
{ 498, 498, EvenOdd },
|
||||
{ 499, 499, -2 },
|
||||
{ 500, 501, EvenOdd },
|
||||
{ 502, 502, -97 },
|
||||
{ 503, 503, -56 },
|
||||
{ 504, 543, EvenOdd },
|
||||
{ 544, 544, -130 },
|
||||
{ 546, 563, EvenOdd },
|
||||
{ 570, 570, 10795 },
|
||||
{ 571, 572, OddEven },
|
||||
{ 573, 573, -163 },
|
||||
{ 574, 574, 10792 },
|
||||
{ 575, 576, 10815 },
|
||||
{ 577, 578, OddEven },
|
||||
{ 579, 579, -195 },
|
||||
{ 580, 580, 69 },
|
||||
{ 581, 581, 71 },
|
||||
{ 582, 591, EvenOdd },
|
||||
{ 592, 592, 10783 },
|
||||
{ 593, 593, 10780 },
|
||||
{ 594, 594, 10782 },
|
||||
{ 595, 595, -210 },
|
||||
{ 596, 596, -206 },
|
||||
{ 598, 599, -205 },
|
||||
{ 601, 601, -202 },
|
||||
{ 603, 603, -203 },
|
||||
{ 608, 608, -205 },
|
||||
{ 611, 611, -207 },
|
||||
{ 613, 613, 42280 },
|
||||
{ 616, 616, -209 },
|
||||
{ 617, 617, -211 },
|
||||
{ 619, 619, 10743 },
|
||||
{ 623, 623, -211 },
|
||||
{ 625, 625, 10749 },
|
||||
{ 626, 626, -213 },
|
||||
{ 629, 629, -214 },
|
||||
{ 637, 637, 10727 },
|
||||
{ 640, 640, -218 },
|
||||
{ 643, 643, -218 },
|
||||
{ 648, 648, -218 },
|
||||
{ 649, 649, -69 },
|
||||
{ 650, 651, -217 },
|
||||
{ 652, 652, -71 },
|
||||
{ 658, 658, -219 },
|
||||
{ 837, 837, 84 },
|
||||
{ 880, 883, EvenOdd },
|
||||
{ 886, 887, EvenOdd },
|
||||
{ 891, 893, 130 },
|
||||
{ 902, 902, 38 },
|
||||
{ 904, 906, 37 },
|
||||
{ 908, 908, 64 },
|
||||
{ 910, 911, 63 },
|
||||
{ 913, 929, 32 },
|
||||
{ 931, 931, 31 },
|
||||
{ 932, 939, 32 },
|
||||
{ 940, 940, -38 },
|
||||
{ 941, 943, -37 },
|
||||
{ 945, 945, -32 },
|
||||
{ 946, 946, 30 },
|
||||
{ 947, 948, -32 },
|
||||
{ 949, 949, 64 },
|
||||
{ 950, 951, -32 },
|
||||
{ 952, 952, 25 },
|
||||
{ 953, 953, 7173 },
|
||||
{ 954, 954, 54 },
|
||||
{ 955, 955, -32 },
|
||||
{ 956, 956, -775 },
|
||||
{ 957, 959, -32 },
|
||||
{ 960, 960, 22 },
|
||||
{ 961, 961, 48 },
|
||||
{ 962, 962, EvenOdd },
|
||||
{ 963, 965, -32 },
|
||||
{ 966, 966, 15 },
|
||||
{ 967, 968, -32 },
|
||||
{ 969, 969, 7517 },
|
||||
{ 970, 971, -32 },
|
||||
{ 972, 972, -64 },
|
||||
{ 973, 974, -63 },
|
||||
{ 975, 975, 8 },
|
||||
{ 976, 976, -62 },
|
||||
{ 977, 977, 35 },
|
||||
{ 981, 981, -47 },
|
||||
{ 982, 982, -54 },
|
||||
{ 983, 983, -8 },
|
||||
{ 984, 1007, EvenOdd },
|
||||
{ 1008, 1008, -86 },
|
||||
{ 1009, 1009, -80 },
|
||||
{ 1010, 1010, 7 },
|
||||
{ 1012, 1012, -92 },
|
||||
{ 1013, 1013, -96 },
|
||||
{ 1015, 1016, OddEven },
|
||||
{ 1017, 1017, -7 },
|
||||
{ 1018, 1019, EvenOdd },
|
||||
{ 1021, 1023, -130 },
|
||||
{ 1024, 1039, 80 },
|
||||
{ 1040, 1071, 32 },
|
||||
{ 1072, 1103, -32 },
|
||||
{ 1104, 1119, -80 },
|
||||
{ 1120, 1153, EvenOdd },
|
||||
{ 1162, 1215, EvenOdd },
|
||||
{ 1216, 1216, 15 },
|
||||
{ 1217, 1230, OddEven },
|
||||
{ 1231, 1231, -15 },
|
||||
{ 1232, 1319, EvenOdd },
|
||||
{ 1329, 1366, 48 },
|
||||
{ 1377, 1414, -48 },
|
||||
{ 4256, 4293, 7264 },
|
||||
{ 7545, 7545, 35332 },
|
||||
{ 7549, 7549, 3814 },
|
||||
{ 7680, 7776, EvenOdd },
|
||||
{ 7777, 7777, 58 },
|
||||
{ 7778, 7829, EvenOdd },
|
||||
{ 7835, 7835, -59 },
|
||||
{ 7838, 7838, -7615 },
|
||||
{ 7840, 7935, EvenOdd },
|
||||
{ 7936, 7943, 8 },
|
||||
{ 7944, 7951, -8 },
|
||||
{ 7952, 7957, 8 },
|
||||
{ 7960, 7965, -8 },
|
||||
{ 7968, 7975, 8 },
|
||||
{ 7976, 7983, -8 },
|
||||
{ 7984, 7991, 8 },
|
||||
{ 7992, 7999, -8 },
|
||||
{ 8000, 8005, 8 },
|
||||
{ 8008, 8013, -8 },
|
||||
{ 8017, 8017, 8 },
|
||||
{ 8019, 8019, 8 },
|
||||
{ 8021, 8021, 8 },
|
||||
{ 8023, 8023, 8 },
|
||||
{ 8025, 8025, -8 },
|
||||
{ 8027, 8027, -8 },
|
||||
{ 8029, 8029, -8 },
|
||||
{ 8031, 8031, -8 },
|
||||
{ 8032, 8039, 8 },
|
||||
{ 8040, 8047, -8 },
|
||||
{ 8048, 8049, 74 },
|
||||
{ 8050, 8053, 86 },
|
||||
{ 8054, 8055, 100 },
|
||||
{ 8056, 8057, 128 },
|
||||
{ 8058, 8059, 112 },
|
||||
{ 8060, 8061, 126 },
|
||||
{ 8064, 8071, 8 },
|
||||
{ 8072, 8079, -8 },
|
||||
{ 8080, 8087, 8 },
|
||||
{ 8088, 8095, -8 },
|
||||
{ 8096, 8103, 8 },
|
||||
{ 8104, 8111, -8 },
|
||||
{ 8112, 8113, 8 },
|
||||
{ 8115, 8115, 9 },
|
||||
{ 8120, 8121, -8 },
|
||||
{ 8122, 8123, -74 },
|
||||
{ 8124, 8124, -9 },
|
||||
{ 8126, 8126, -7289 },
|
||||
{ 8131, 8131, 9 },
|
||||
{ 8136, 8139, -86 },
|
||||
{ 8140, 8140, -9 },
|
||||
{ 8144, 8145, 8 },
|
||||
{ 8152, 8153, -8 },
|
||||
{ 8154, 8155, -100 },
|
||||
{ 8160, 8161, 8 },
|
||||
{ 8165, 8165, 7 },
|
||||
{ 8168, 8169, -8 },
|
||||
{ 8170, 8171, -112 },
|
||||
{ 8172, 8172, -7 },
|
||||
{ 8179, 8179, 9 },
|
||||
{ 8184, 8185, -128 },
|
||||
{ 8186, 8187, -126 },
|
||||
{ 8188, 8188, -9 },
|
||||
{ 8486, 8486, -7549 },
|
||||
{ 8490, 8490, -8415 },
|
||||
{ 8491, 8491, -8294 },
|
||||
{ 8498, 8498, 28 },
|
||||
{ 8526, 8526, -28 },
|
||||
{ 8544, 8559, 16 },
|
||||
{ 8560, 8575, -16 },
|
||||
{ 8579, 8580, OddEven },
|
||||
{ 9398, 9423, 26 },
|
||||
{ 9424, 9449, -26 },
|
||||
{ 11264, 11310, 48 },
|
||||
{ 11312, 11358, -48 },
|
||||
{ 11360, 11361, EvenOdd },
|
||||
{ 11362, 11362, -10743 },
|
||||
{ 11363, 11363, -3814 },
|
||||
{ 11364, 11364, -10727 },
|
||||
{ 11365, 11365, -10795 },
|
||||
{ 11366, 11366, -10792 },
|
||||
{ 11367, 11372, OddEven },
|
||||
{ 11373, 11373, -10780 },
|
||||
{ 11374, 11374, -10749 },
|
||||
{ 11375, 11375, -10783 },
|
||||
{ 11376, 11376, -10782 },
|
||||
{ 11378, 11379, EvenOdd },
|
||||
{ 11381, 11382, OddEven },
|
||||
{ 11390, 11391, -10815 },
|
||||
{ 11392, 11491, EvenOdd },
|
||||
{ 11499, 11502, OddEven },
|
||||
{ 11520, 11557, -7264 },
|
||||
{ 42560, 42605, EvenOdd },
|
||||
{ 42624, 42647, EvenOdd },
|
||||
{ 42786, 42799, EvenOdd },
|
||||
{ 42802, 42863, EvenOdd },
|
||||
{ 42873, 42876, OddEven },
|
||||
{ 42877, 42877, -35332 },
|
||||
{ 42878, 42887, EvenOdd },
|
||||
{ 42891, 42892, OddEven },
|
||||
{ 42893, 42893, -42280 },
|
||||
{ 42896, 42897, EvenOdd },
|
||||
{ 42912, 42921, EvenOdd },
|
||||
{ 65313, 65338, 32 },
|
||||
{ 65345, 65370, -32 },
|
||||
{ 66560, 66599, 40 },
|
||||
{ 66600, 66639, -40 },
|
||||
};
|
||||
// Entry count of the unicode_casefold[] table that ends just above.
// NOTE(review): this literal presumably has to be updated by hand (or by the
// generator) whenever rows are added to or removed from the table.
int num_unicode_casefold = 282;
|
||||
|
||||
// 1029 groups, 1050 pairs, 163 ranges
|
||||
CaseFold unicode_tolower[] = {
|
||||
{ 65, 90, 32 },
|
||||
{ 181, 181, 775 },
|
||||
{ 192, 214, 32 },
|
||||
{ 216, 222, 32 },
|
||||
{ 256, 302, EvenOddSkip },
|
||||
{ 306, 310, EvenOddSkip },
|
||||
{ 313, 327, OddEvenSkip },
|
||||
{ 330, 374, EvenOddSkip },
|
||||
{ 376, 376, -121 },
|
||||
{ 377, 381, OddEvenSkip },
|
||||
{ 383, 383, -268 },
|
||||
{ 385, 385, 210 },
|
||||
{ 386, 388, EvenOddSkip },
|
||||
{ 390, 390, 206 },
|
||||
{ 391, 391, OddEven },
|
||||
{ 393, 394, 205 },
|
||||
{ 395, 395, OddEven },
|
||||
{ 398, 398, 79 },
|
||||
{ 399, 399, 202 },
|
||||
{ 400, 400, 203 },
|
||||
{ 401, 401, OddEven },
|
||||
{ 403, 403, 205 },
|
||||
{ 404, 404, 207 },
|
||||
{ 406, 406, 211 },
|
||||
{ 407, 407, 209 },
|
||||
{ 408, 408, EvenOdd },
|
||||
{ 412, 412, 211 },
|
||||
{ 413, 413, 213 },
|
||||
{ 415, 415, 214 },
|
||||
{ 416, 420, EvenOddSkip },
|
||||
{ 422, 422, 218 },
|
||||
{ 423, 423, OddEven },
|
||||
{ 425, 425, 218 },
|
||||
{ 428, 428, EvenOdd },
|
||||
{ 430, 430, 218 },
|
||||
{ 431, 431, OddEven },
|
||||
{ 433, 434, 217 },
|
||||
{ 435, 437, OddEvenSkip },
|
||||
{ 439, 439, 219 },
|
||||
{ 440, 440, EvenOdd },
|
||||
{ 444, 444, EvenOdd },
|
||||
{ 452, 452, 2 },
|
||||
{ 453, 453, OddEven },
|
||||
{ 455, 455, 2 },
|
||||
{ 456, 456, EvenOdd },
|
||||
{ 458, 458, 2 },
|
||||
{ 459, 475, OddEvenSkip },
|
||||
{ 478, 494, EvenOddSkip },
|
||||
{ 497, 497, 2 },
|
||||
{ 498, 500, EvenOddSkip },
|
||||
{ 502, 502, -97 },
|
||||
{ 503, 503, -56 },
|
||||
{ 504, 542, EvenOddSkip },
|
||||
{ 544, 544, -130 },
|
||||
{ 546, 562, EvenOddSkip },
|
||||
{ 570, 570, 10795 },
|
||||
{ 571, 571, OddEven },
|
||||
{ 573, 573, -163 },
|
||||
{ 574, 574, 10792 },
|
||||
{ 577, 577, OddEven },
|
||||
{ 579, 579, -195 },
|
||||
{ 580, 580, 69 },
|
||||
{ 581, 581, 71 },
|
||||
{ 582, 590, EvenOddSkip },
|
||||
{ 837, 837, 116 },
|
||||
{ 880, 882, EvenOddSkip },
|
||||
{ 886, 886, EvenOdd },
|
||||
{ 902, 902, 38 },
|
||||
{ 904, 906, 37 },
|
||||
{ 908, 908, 64 },
|
||||
{ 910, 911, 63 },
|
||||
{ 913, 929, 32 },
|
||||
{ 931, 939, 32 },
|
||||
{ 962, 962, EvenOdd },
|
||||
{ 975, 975, 8 },
|
||||
{ 976, 976, -30 },
|
||||
{ 977, 977, -25 },
|
||||
{ 981, 981, -15 },
|
||||
{ 982, 982, -22 },
|
||||
{ 984, 1006, EvenOddSkip },
|
||||
{ 1008, 1008, -54 },
|
||||
{ 1009, 1009, -48 },
|
||||
{ 1012, 1012, -60 },
|
||||
{ 1013, 1013, -64 },
|
||||
{ 1015, 1015, OddEven },
|
||||
{ 1017, 1017, -7 },
|
||||
{ 1018, 1018, EvenOdd },
|
||||
{ 1021, 1023, -130 },
|
||||
{ 1024, 1039, 80 },
|
||||
{ 1040, 1071, 32 },
|
||||
{ 1120, 1152, EvenOddSkip },
|
||||
{ 1162, 1214, EvenOddSkip },
|
||||
{ 1216, 1216, 15 },
|
||||
{ 1217, 1229, OddEvenSkip },
|
||||
{ 1232, 1318, EvenOddSkip },
|
||||
{ 1329, 1366, 48 },
|
||||
{ 4256, 4293, 7264 },
|
||||
{ 7680, 7828, EvenOddSkip },
|
||||
{ 7835, 7835, -58 },
|
||||
{ 7838, 7838, -7615 },
|
||||
{ 7840, 7934, EvenOddSkip },
|
||||
{ 7944, 7951, -8 },
|
||||
{ 7960, 7965, -8 },
|
||||
{ 7976, 7983, -8 },
|
||||
{ 7992, 7999, -8 },
|
||||
{ 8008, 8013, -8 },
|
||||
{ 8025, 8025, -8 },
|
||||
{ 8027, 8027, -8 },
|
||||
{ 8029, 8029, -8 },
|
||||
{ 8031, 8031, -8 },
|
||||
{ 8040, 8047, -8 },
|
||||
{ 8072, 8079, -8 },
|
||||
{ 8088, 8095, -8 },
|
||||
{ 8104, 8111, -8 },
|
||||
{ 8120, 8121, -8 },
|
||||
{ 8122, 8123, -74 },
|
||||
{ 8124, 8124, -9 },
|
||||
{ 8126, 8126, -7173 },
|
||||
{ 8136, 8139, -86 },
|
||||
{ 8140, 8140, -9 },
|
||||
{ 8152, 8153, -8 },
|
||||
{ 8154, 8155, -100 },
|
||||
{ 8168, 8169, -8 },
|
||||
{ 8170, 8171, -112 },
|
||||
{ 8172, 8172, -7 },
|
||||
{ 8184, 8185, -128 },
|
||||
{ 8186, 8187, -126 },
|
||||
{ 8188, 8188, -9 },
|
||||
{ 8486, 8486, -7517 },
|
||||
{ 8490, 8490, -8383 },
|
||||
{ 8491, 8491, -8262 },
|
||||
{ 8498, 8498, 28 },
|
||||
{ 8544, 8559, 16 },
|
||||
{ 8579, 8579, OddEven },
|
||||
{ 9398, 9423, 26 },
|
||||
{ 11264, 11310, 48 },
|
||||
{ 11360, 11360, EvenOdd },
|
||||
{ 11362, 11362, -10743 },
|
||||
{ 11363, 11363, -3814 },
|
||||
{ 11364, 11364, -10727 },
|
||||
{ 11367, 11371, OddEvenSkip },
|
||||
{ 11373, 11373, -10780 },
|
||||
{ 11374, 11374, -10749 },
|
||||
{ 11375, 11375, -10783 },
|
||||
{ 11376, 11376, -10782 },
|
||||
{ 11378, 11378, EvenOdd },
|
||||
{ 11381, 11381, OddEven },
|
||||
{ 11390, 11391, -10815 },
|
||||
{ 11392, 11490, EvenOddSkip },
|
||||
{ 11499, 11501, OddEvenSkip },
|
||||
{ 42560, 42604, EvenOddSkip },
|
||||
{ 42624, 42646, EvenOddSkip },
|
||||
{ 42786, 42798, EvenOddSkip },
|
||||
{ 42802, 42862, EvenOddSkip },
|
||||
{ 42873, 42875, OddEvenSkip },
|
||||
{ 42877, 42877, -35332 },
|
||||
{ 42878, 42886, EvenOddSkip },
|
||||
{ 42891, 42891, OddEven },
|
||||
{ 42893, 42893, -42280 },
|
||||
{ 42896, 42896, EvenOdd },
|
||||
{ 42912, 42920, EvenOddSkip },
|
||||
{ 65313, 65338, 32 },
|
||||
{ 66560, 66599, 40 },
|
||||
};
|
||||
// Entry count of unicode_tolower[] (the table above lists 163 rows).
int num_unicode_tolower = 163;
|
||||
|
||||
|
||||
|
||||
} // namespace re2
|
||||
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Unicode case folding tables.
|
||||
|
||||
// The Unicode case folding tables encode the mapping from one Unicode point
|
||||
// to the next largest Unicode point with equivalent folding. The largest
|
||||
// point wraps back to the first. For example, the tables map:
|
||||
//
|
||||
// 'A' -> 'a'
|
||||
// 'a' -> 'A'
|
||||
//
|
||||
// 'K' -> 'k'
|
||||
// 'k' -> U+212A (KELVIN SIGN, which looks like 'K')
|
||||
// U+212A (KELVIN SIGN) -> 'K' (the largest point wraps back to the first)
|
||||
//
|
||||
// Like everything Unicode, these tables are big. If we represent the table
|
||||
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
|
||||
// Most table entries look like the ones around them:
|
||||
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
|
||||
// Instead of listing all the pairs explicitly, we make a list of ranges
|
||||
// and deltas, so that the table entries for 'A' through 'Z' can be represented
|
||||
// as a single entry { 'A', 'Z', +32 }.
|
||||
//
|
||||
// In addition to blocks that map to each other (A-Z mapping to a-z)
|
||||
// there are blocks of pairs that individually map to each other
|
||||
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
|
||||
// For those, the special delta value EvenOdd marks even/odd pairs
|
||||
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
|
||||
//
|
||||
// In this form, the table has 282 entries, about 3kB. If we were to split
|
||||
// the table into one for 16-bit codes and an overflow table for larger ones,
|
||||
// we could get it down to about 1.5kB, but that's not worth the complexity.
|
||||
//
|
||||
// The grouped form also allows for efficient fold range calculations
|
||||
// rather than looping one character at a time.
|
||||
|
||||
#ifndef RE2_UNICODE_CASEFOLD_H__
|
||||
#define RE2_UNICODE_CASEFOLD_H__
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Special values for CaseFold::delta.  Any other delta is a plain offset.
enum {
  // Even/odd pairs: an even code point maps to the next one, an odd code
  // point maps to the previous one.
  EvenOdd = 1,

  // Odd/even pairs: the mirror image of EvenOdd.
  OddEven = -1,

  // NOTE(review): presumably the same pairings as above but applied only to
  // every other code point of the range (the tolower table uses these where
  // the fold table uses EvenOdd/OddEven) -- confirm against ApplyFold.
  EvenOddSkip = 1 << 30,
  OddEvenSkip = (1 << 30) + 1
};
|
||||
|
||||
struct CaseFold {
|
||||
uint32 lo;
|
||||
uint32 hi;
|
||||
int32 delta;
|
||||
};
|
||||
|
||||
extern CaseFold unicode_casefold[];
|
||||
extern int num_unicode_casefold;
|
||||
|
||||
extern CaseFold unicode_tolower[];
|
||||
extern int num_unicode_tolower;
|
||||
|
||||
// Returns the CaseFold* in the tables that contains rune.
|
||||
// If rune is not in the tables, returns the first CaseFold* after rune.
|
||||
// If rune is larger than any value in the tables, returns NULL.
|
||||
extern CaseFold* LookupCaseFold(CaseFold*, int, Rune rune);
|
||||
|
||||
// Returns the result of applying the fold f to the rune r.
|
||||
extern Rune ApplyFold(CaseFold *f, Rune r);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UNICODE_CASEFOLD_H__
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,64 @@
|
|||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Unicode character groups.
|
||||
|
||||
// The codes get split into ranges of 16-bit codes
|
||||
// and ranges of 32-bit codes. It would be simpler
|
||||
// to use only 32-bit ranges, but these tables are large
|
||||
// enough to warrant extra care.
|
||||
//
|
||||
// Using just 32-bit ranges gives 27 kB of data.
|
||||
// Adding 16-bit ranges gives 18 kB of data.
|
||||
// Adding an extra table of 16-bit singletons would reduce
|
||||
// to 16.5 kB of data but make the data harder to use;
|
||||
// we don't bother.
|
||||
|
||||
#ifndef RE2_UNICODE_GROUPS_H__
|
||||
#define RE2_UNICODE_GROUPS_H__
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct URange16
|
||||
{
|
||||
uint16 lo;
|
||||
uint16 hi;
|
||||
};
|
||||
|
||||
struct URange32
|
||||
{
|
||||
uint32 lo;
|
||||
uint32 hi;
|
||||
};
|
||||
|
||||
struct UGroup
|
||||
{
|
||||
const char *name;
|
||||
int sign; // +1 for [abc], -1 for [^abc]
|
||||
URange16 *r16;
|
||||
int nr16;
|
||||
URange32 *r32;
|
||||
int nr32;
|
||||
};
|
||||
|
||||
// Named by property or script name (e.g., "Nd", "N", "Han").
|
||||
// Negated groups are not included.
|
||||
extern UGroup unicode_groups[];
|
||||
extern int num_unicode_groups;
|
||||
|
||||
// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
|
||||
// Negated groups are included.
|
||||
extern UGroup posix_groups[];
|
||||
extern int num_posix_groups;
|
||||
|
||||
// Named by Perl name (e.g., "\\d", "\\D").
|
||||
// Negated groups are included.
|
||||
extern UGroup perl_groups[];
|
||||
extern int num_perl_groups;
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UNICODE_GROUPS_H__
|
|
@ -0,0 +1,346 @@
|
|||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_VARIADIC_FUNCTION_H_
|
||||
#define RE2_VARIADIC_FUNCTION_H_
|
||||
|
||||
namespace re2 {
|
||||
|
||||
template <typename Result, typename Param0, typename Param1, typename Arg,
|
||||
Result (*Func)(Param0, Param1, const Arg* const [], int count)>
|
||||
class VariadicFunction2 {
|
||||
public:
|
||||
VariadicFunction2() {}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1) const {
|
||||
return Func(p0, p1, 0, 0);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0) const {
|
||||
const Arg* const args[] = { &a0 };
|
||||
return Func(p0, p1, args, 1);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const {
|
||||
const Arg* const args[] = { &a0, &a1 };
|
||||
return Func(p0, p1, args, 2);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2 };
|
||||
return Func(p0, p1, args, 3);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3 };
|
||||
return Func(p0, p1, args, 4);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 };
|
||||
return Func(p0, p1, args, 5);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 };
|
||||
return Func(p0, p1, args, 6);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 };
|
||||
return Func(p0, p1, args, 7);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 };
|
||||
return Func(p0, p1, args, 8);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 };
|
||||
return Func(p0, p1, args, 9);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9 };
|
||||
return Func(p0, p1, args, 10);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10 };
|
||||
return Func(p0, p1, args, 11);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11 };
|
||||
return Func(p0, p1, args, 12);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12 };
|
||||
return Func(p0, p1, args, 13);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13 };
|
||||
return Func(p0, p1, args, 14);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14 };
|
||||
return Func(p0, p1, args, 15);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15 };
|
||||
return Func(p0, p1, args, 16);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 };
|
||||
return Func(p0, p1, args, 17);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 };
|
||||
return Func(p0, p1, args, 18);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 };
|
||||
return Func(p0, p1, args, 19);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 };
|
||||
return Func(p0, p1, args, 20);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19,
|
||||
&a20 };
|
||||
return Func(p0, p1, args, 21);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21 };
|
||||
return Func(p0, p1, args, 22);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22 };
|
||||
return Func(p0, p1, args, 23);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23 };
|
||||
return Func(p0, p1, args, 24);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23, const Arg& a24) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23, &a24 };
|
||||
return Func(p0, p1, args, 25);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23, &a24, &a25 };
|
||||
return Func(p0, p1, args, 26);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
||||
const Arg& a26) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23, &a24, &a25, &a26 };
|
||||
return Func(p0, p1, args, 27);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
||||
const Arg& a26, const Arg& a27) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27 };
|
||||
return Func(p0, p1, args, 28);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
||||
const Arg& a26, const Arg& a27, const Arg& a28) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 };
|
||||
return Func(p0, p1, args, 29);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
||||
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 };
|
||||
return Func(p0, p1, args, 30);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
||||
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
|
||||
const Arg& a30) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 };
|
||||
return Func(p0, p1, args, 31);
|
||||
}
|
||||
|
||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
||||
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
|
||||
const Arg& a30, const Arg& a31) const {
|
||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 };
|
||||
return Func(p0, p1, args, 32);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_VARIADIC_FUNCTION_H_
|
|
@ -0,0 +1,244 @@
|
|||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Helper class for traversing Regexps without recursion.
|
||||
// Clients should declare their own subclasses that override
|
||||
// the PreVisit and PostVisit methods, which are called before
|
||||
// and after visiting the subexpressions.
|
||||
|
||||
// Not quite the Visitor pattern, because (among other things)
|
||||
// the Visitor pattern is recursive.
|
||||
|
||||
#ifndef RE2_WALKER_INL_H__
|
||||
#define RE2_WALKER_INL_H__
|
||||
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
template<typename T> struct WalkState;
|
||||
|
||||
// Non-recursive traversal helper.  Subclasses override PreVisit/PostVisit
// (and must implement ShortVisit) to compute a T over a Regexp tree.
template<typename T> class Regexp::Walker {
 public:
  Walker();
  virtual ~Walker();

  // Called on re before its children are visited.
  //  - The returned T is owned by the caller.  It is later handed to
  //    PostVisit as pre_arg and to the children's PreVisit/PostVisit as
  //    parent_arg.
  //  - For the top-most Regexp, parent_arg is the arg given to Walk.
  //  - Setting *stop to true skips the children entirely; the value returned
  //    here is then used as if PostVisit had returned it.
  // The default implementation returns parent_arg.
  virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);

  // Called on re after its children have been visited.
  //  - pre_arg is what PreVisit returned for re; PostVisit owns it.
  //  - child_args holds the nchild_args values returned by the children's
  //    PostVisits; PostVisit owns those values but not the array itself.
  //  - The returned T is owned by the caller.
  // The default implementation returns pre_arg.
  virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
                      T* child_args, int nchild_args);

  // Called to duplicate a T when Walk notices that the same re appears as
  // more than one child.
  virtual T Copy(T arg);

  // "Quick visit" of re that does not look at its children.  Only used once
  // the visit budget is exhausted and the walk is being aborted as fast as
  // possible; the result should still make sense to the parent PostVisits
  // that remain to be run.  Hopefully only reached via WalkExponential, but
  // every client has to implement it, just in case.
  virtual T ShortVisit(Regexp* re, T parent_arg) = 0;

  // Walks re.  top_arg is the parent_arg seen by PreVisit and PostVisit of
  // re itself; the result is whatever PostVisit returned for re.
  T Walk(Regexp* re, T top_arg);

  // Same as Walk but never calls Copy, which can take exponential time on
  // cross-linked Regexps such as those produced by Simplify.  To bound that,
  // no more than max_visits nodes are visited before the walk is cut short;
  // once that happens, regexps that cannot be fully visited get
  // ShortVisit(re) instead of PreVisit/PostVisit.
  T WalkExponential(Regexp* re, T top_arg, int max_visits);

  // Empties the stack.  Should never be needed because Walk starts and
  // finishes with an empty stack; logs DFATAL if the stack is not empty.
  void Reset();

  // True if the walk was cut off.
  bool stopped_early() { return stopped_early_; }

 private:
  T WalkInternal(Regexp* re, T top_arg, bool use_copy);

  // Walk state for the entire traversal.
  stack<WalkState<T> >* stack_;
  bool stopped_early_;
  int max_visits_;

  DISALLOW_EVIL_CONSTRUCTORS(Walker);
};
|
||||
|
||||
// Default pre-visit hook: hands parent_arg straight back and leaves *stop
// untouched.
template<typename T>
T Regexp::Walker<T>::PreVisit(Regexp* re, T parent_arg, bool* stop) {
  return parent_arg;
}
|
||||
|
||||
// Default PostVisit: ignores the node, the parent argument and all child
// results, and simply returns the value produced by PreVisit.
template<typename T>
T Regexp::Walker<T>::PostVisit(Regexp* re,
                               T parent_arg,
                               T pre_arg,
                               T* child_args,
                               int nchild_args) {
  return pre_arg;
}
|
||||
|
||||
// Default Copy: the argument is returned as-is (a plain by-value copy).
template<typename T>
T Regexp::Walker<T>::Copy(T arg) {
  return arg;
}
|
||||
|
||||
// State about a single level in the traversal.
|
||||
template<typename T> struct WalkState {
|
||||
WalkState<T>(Regexp* re, T parent)
|
||||
: re(re),
|
||||
n(-1),
|
||||
parent_arg(parent),
|
||||
child_args(NULL) { }
|
||||
|
||||
Regexp* re; // The regexp
|
||||
int n; // The index of the next child to process; -1 means need to PreVisit
|
||||
T parent_arg; // Accumulated arguments.
|
||||
T pre_arg;
|
||||
T child_arg; // One-element buffer for child_args.
|
||||
T* child_args;
|
||||
};
|
||||
|
||||
// Builds a walker with an empty traversal stack.
// max_visits_ is given a defined value here; Walk/WalkExponential overwrite
// it before every traversal, but leaving it uninitialized made the object
// state indeterminate until the first walk.
template<typename T> Regexp::Walker<T>::Walker()
  : stack_(new stack<WalkState<T> >),
    stopped_early_(false),
    max_visits_(0) {
}
|
||||
|
||||
// Drains any leftover traversal state via Reset(), then releases the
// heap-allocated stack itself.
template<typename T>
Regexp::Walker<T>::~Walker() {
  Reset();
  delete stack_;
}
|
||||
|
||||
// Clears the stack. Should never be necessary, since
|
||||
// Walk always enters and exits with an empty stack.
|
||||
// Logs DFATAL if stack is not already clear.
|
||||
template<typename T> void Regexp::Walker<T>::Reset() {
  if (stack_ == NULL || stack_->empty())
    return;

  LOG(DFATAL) << "Stack not empty.";
  while (!stack_->empty()) {
    // child_args is heap memory only for nodes with more than one child,
    // where WalkInternal allocates it with new T[]. For a single child it
    // points at the state's own child_arg and must not be freed at all;
    // before PreVisit it is NULL. Mirror WalkInternal's release rule and
    // use the array form of delete.
    WalkState<T>& state = stack_->top();
    if (state.re->nsub_ > 1)
      delete[] state.child_args;
    stack_->pop();
  }
}
|
||||
|
||||
// Iterative (explicit-stack) traversal shared by Walk and WalkExponential.
//
// Each stack entry tracks one node: n == -1 means the node still needs its
// PreVisit, otherwise n is the index of the next child to handle. When
// use_copy is set, a child identical to its immediate predecessor is not
// re-walked; its result is produced by Copy() instead.
// Returns the value computed for the root, or top_arg if re is NULL.
template<typename T>
T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, bool use_copy) {
  Reset();

  if (re == NULL) {
    LOG(DFATAL) << "Walk NULL";
    return top_arg;
  }

  stack_->push(WalkState<T>(re, top_arg));

  for (;;) {
    T result;
    WalkState<T>* frame = &stack_->top();
    Regexp* node = frame->re;
    bool finished = false;  // true once `result` holds this node's value

    if (frame->n == -1) {
      // First time we see this node.
      if (--max_visits_ < 0) {
        // Budget exhausted: take the cheap path and skip the children.
        stopped_early_ = true;
        result = ShortVisit(node, frame->parent_arg);
        finished = true;
      } else {
        bool stop = false;
        frame->pre_arg = PreVisit(node, frame->parent_arg, &stop);
        if (stop) {
          result = frame->pre_arg;
          finished = true;
        } else {
          frame->n = 0;
          frame->child_args = NULL;
          if (node->nsub_ == 1)
            frame->child_args = &frame->child_arg;
          else if (node->nsub_ > 1)
            frame->child_args = new T[node->nsub_];
        }
      }
    }

    if (!finished) {
      if (node->nsub_ > 0) {
        Regexp** sub = node->sub();
        if (frame->n < node->nsub_) {
          const int next = frame->n;
          if (use_copy && next > 0 && sub[next - 1] == sub[next]) {
            frame->child_args[next] = Copy(frame->child_args[next - 1]);
            frame->n++;
          } else {
            stack_->push(WalkState<T>(sub[next], frame->pre_arg));
          }
          continue;
        }
      }

      // All children are done (or there are none).
      result = PostVisit(node, frame->parent_arg, frame->pre_arg,
                         frame->child_args, frame->n);
      if (node->nsub_ > 1)
        delete[] frame->child_args;
    }

    // This node is complete: drop it and hand its result to the parent.
    stack_->pop();
    if (stack_->size() == 0)
      return result;
    frame = &stack_->top();
    if (frame->child_args != NULL)
      frame->child_args[frame->n] = result;
    else
      frame->child_arg = result;
    frame->n++;
  }
}
|
||||
|
||||
// Standard walk: duplicate adjacent children are handled through Copy(),
// and the visit budget is fixed at one million nodes -- ample for any
// regexp when the walk is not exponential, yet small enough to bound
// CPU time.
template<typename T>
T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
  const bool use_copy = true;
  max_visits_ = 1000000;
  return WalkInternal(re, top_arg, use_copy);
}
|
||||
|
||||
// Walk without the Copy() shortcut, using the caller-supplied visit budget.
template<typename T>
T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, int max_visits) {
  const bool use_copy = false;
  max_visits_ = max_visits;
  return WalkInternal(re, top_arg, use_copy);
}
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_WALKER_INL_H__
|
|
@ -0,0 +1,21 @@
|
|||
#!/usr/bin/env bash

# Runs every test command given on the command line, capturing each one's
# combined stdout/stderr in "<command>.log" and printing PASS or FAIL.
# Exits 0 only if all of them succeeded.

success=true
for i
do
	# Quote "$i" so a name containing whitespace or glob characters is
	# printed once, verbatim, instead of being split into several
	# printf arguments (which would repeat the format).
	printf "%-40s" "$i"
	if sh -c "$i >$i.log 2>&1" 2>/dev/null
	then
		echo PASS
	else
		echo "FAIL; output in $i.log"
		success=false
	fi
done

if $success; then
	echo 'ALL TESTS PASSED.'
	exit 0
fi
echo 'TESTS FAILED.'
exit 1
|
|
@ -0,0 +1,13 @@
|
|||
#include <re2/re2.h>
|
||||
#include <stdio.h>
|
||||
|
||||
using namespace re2;
|
||||
|
||||
int main(void) {
|
||||
if(RE2::FullMatch("axbyc", "a.*b.*c")) {
|
||||
printf("PASS\n");
|
||||
return 0;
|
||||
}
|
||||
printf("FAIL\n");
|
||||
return 2;
|
||||
}
|
|
@ -0,0 +1,168 @@
|
|||
// Copyright 2000 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// UnsafeArena::UnsafeArena()
|
||||
// UnsafeArena::~UnsafeArena()
|
||||
// Destroying the arena automatically calls Reset()
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
|
||||
// Allocates the first block (block_size bytes, via malloc) and calls
// Reset() to point the free-space cursor at it.
UnsafeArena::UnsafeArena(const size_t block_size)
  : block_size_(block_size),
    freestart_(NULL),                   // set for real in Reset()
    last_alloc_(NULL),
    remaining_(0),
    blocks_alloced_(1),
    overflow_blocks_(NULL) {
  assert(block_size > kDefaultAlignment);

  AllocatedBlock& first = first_blocks_[0];
  first.mem = reinterpret_cast<char*>(malloc(block_size_));
  first.size = block_size_;

  Reset();
}
|
||||
|
||||
// Releases every block: FreeBlocks() drops everything except the blocks
// that are always kept, which are then freed here.
UnsafeArena::~UnsafeArena() {
  FreeBlocks();
  assert(overflow_blocks_ == NULL);    // FreeBlocks() should do that
  // The first X blocks stay allocated always by default.  Delete them now.
  int i = 0;
  while (i < blocks_alloced_) {
    free(first_blocks_[i].mem);
    i++;
  }
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// UnsafeArena::Reset()
|
||||
// Clears all the memory an arena is using.
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
void UnsafeArena::Reset() {
|
||||
FreeBlocks();
|
||||
freestart_ = first_blocks_[0].mem;
|
||||
remaining_ = first_blocks_[0].size;
|
||||
last_alloc_ = NULL;
|
||||
|
||||
// We do not know for sure whether or not the first block is aligned,
|
||||
// so we fix that right now.
|
||||
const int overage = reinterpret_cast<uintptr_t>(freestart_) &
|
||||
(kDefaultAlignment-1);
|
||||
if (overage > 0) {
|
||||
const int waste = kDefaultAlignment - overage;
|
||||
freestart_ += waste;
|
||||
remaining_ -= waste;
|
||||
}
|
||||
freestart_when_empty_ = freestart_;
|
||||
assert(!(reinterpret_cast<uintptr_t>(freestart_)&(kDefaultAlignment-1)));
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------
|
||||
// UnsafeArena::AllocNewBlock()
|
||||
// Adds and returns an AllocatedBlock.
|
||||
// The returned AllocatedBlock* is valid until the next call
|
||||
// to AllocNewBlock or Reset. (i.e. anything that might
|
||||
// affect overflow_blocks_).
|
||||
// -------------------------------------------------------------
|
||||
|
||||
// Reserves a bookkeeping slot for a new block -- in the fixed first_blocks_
// array while it has room, otherwise at the end of overflow_blocks_ -- and
// fills it with block_size bytes obtained from malloc.
UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) {
  AllocatedBlock* slot = NULL;

  if (blocks_alloced_ < arraysize(first_blocks_)) {
    // One of the pre-allocated slots is still free.
    slot = &first_blocks_[blocks_alloced_];
    blocks_alloced_++;
  } else {
    // Out of fixed slots: grow the overflow vector by one entry.
    if (overflow_blocks_ == NULL)
      overflow_blocks_ = new vector<AllocatedBlock>;
    overflow_blocks_->resize(overflow_blocks_->size()+1);
    slot = &overflow_blocks_->back();
  }

  slot->mem = reinterpret_cast<char*>(malloc(block_size));
  slot->size = block_size;
  return slot;
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// UnsafeArena::GetMemoryFallback()
|
||||
// We take memory out of our pool, aligned on the byte boundary
|
||||
// requested. If we don't have space in our current pool, we
|
||||
// allocate a new block (wasting the remaining space in the
|
||||
// current block) and give you that. If your memory needs are
|
||||
// too big for a single block, we make a special your-memory-only
|
||||
// allocation -- this is equivalent to not using the arena at all.
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) {
|
||||
if (size == 0)
|
||||
return NULL; // stl/stl_alloc.h says this is okay
|
||||
|
||||
assert(align > 0 && 0 == (align & (align - 1))); // must be power of 2
|
||||
|
||||
// If the object is more than a quarter of the block size, allocate
|
||||
// it separately to avoid wasting too much space in leftover bytes
|
||||
if (block_size_ == 0 || size > block_size_/4) {
|
||||
// then it gets its own block in the arena
|
||||
assert(align <= kDefaultAlignment); // because that's what new gives us
|
||||
// This block stays separate from the rest of the world; in particular
|
||||
// we don't update last_alloc_ so you can't reclaim space on this block.
|
||||
return AllocNewBlock(size)->mem;
|
||||
}
|
||||
|
||||
const int overage =
|
||||
(reinterpret_cast<uintptr_t>(freestart_) & (align-1));
|
||||
if (overage) {
|
||||
const int waste = align - overage;
|
||||
freestart_ += waste;
|
||||
if (waste < remaining_) {
|
||||
remaining_ -= waste;
|
||||
} else {
|
||||
remaining_ = 0;
|
||||
}
|
||||
}
|
||||
if (size > remaining_) {
|
||||
AllocatedBlock *block = AllocNewBlock(block_size_);
|
||||
freestart_ = block->mem;
|
||||
remaining_ = block->size;
|
||||
}
|
||||
remaining_ -= size;
|
||||
last_alloc_ = freestart_;
|
||||
freestart_ += size;
|
||||
assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0);
|
||||
return reinterpret_cast<void*>(last_alloc_);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// UnsafeArena::FreeBlocks()
|
||||
// Unlike GetMemory(), which does actual work, ReturnMemory() is a
|
||||
// no-op: we don't "free" memory until Reset() is called. We do
|
||||
// update some stats, though. Note we do no checking that the
|
||||
// pointer you pass in was actually allocated by us, or that it
|
||||
// was allocated for the size you say, so be careful here!
|
||||
// FreeBlocks() does the work for Reset(), actually freeing all
|
||||
// memory allocated in one fell swoop.
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
void UnsafeArena::FreeBlocks() {
|
||||
for ( int i = 1; i < blocks_alloced_; ++i ) { // keep first block alloced
|
||||
free(first_blocks_[i].mem);
|
||||
first_blocks_[i].mem = NULL;
|
||||
first_blocks_[i].size = 0;
|
||||
}
|
||||
blocks_alloced_ = 1;
|
||||
if (overflow_blocks_ != NULL) {
|
||||
vector<AllocatedBlock>::iterator it;
|
||||
for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
|
||||
free(it->mem);
|
||||
}
|
||||
delete overflow_blocks_; // These should be used very rarely
|
||||
overflow_blocks_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,103 @@
|
|||
// Copyright 2000 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Sometimes it is necessary to allocate a large number of small
|
||||
// objects. Doing this the usual way (malloc, new) is slow,
|
||||
// especially for multithreaded programs. An UnsafeArena provides a
|
||||
// mark/release method of memory management: it asks for a large chunk
|
||||
// from the operating system and doles it out bit by bit as required.
|
||||
// Then you free all the memory at once by calling UnsafeArena::Reset().
|
||||
// The "Unsafe" refers to the fact that UnsafeArena is not safe to
|
||||
// call from multiple threads.
|
||||
//
|
||||
// The global operator new can be used as follows:
|
||||
//
|
||||
// #include "lib/arena-inl.h"
|
||||
//
|
||||
// UnsafeArena arena(1000);
|
||||
// Foo* foo = new (AllocateInArena, &arena) Foo;
|
||||
//
|
||||
|
||||
#ifndef RE2_UTIL_ARENA_H_
|
||||
#define RE2_UTIL_ARENA_H_
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// This class is thread-compatible.
|
||||
class UnsafeArena {
|
||||
public:
|
||||
UnsafeArena(const size_t block_size);
|
||||
virtual ~UnsafeArena();
|
||||
|
||||
void Reset();
|
||||
|
||||
// This should be the worst-case alignment for any type. This is
|
||||
// good for IA-32, SPARC version 7 (the last one I know), and
|
||||
// supposedly Alpha. i386 would be more time-efficient with a
|
||||
// default alignment of 8, but ::operator new() uses alignment of 4,
|
||||
// and an assertion will fail below after the call to MakeNewBlock()
|
||||
// if you try to use a larger alignment.
|
||||
#ifdef __i386__
|
||||
static const int kDefaultAlignment = 4;
|
||||
#else
|
||||
static const int kDefaultAlignment = 8;
|
||||
#endif
|
||||
|
||||
private:
|
||||
void* GetMemoryFallback(const size_t size, const int align);
|
||||
|
||||
public:
|
||||
void* GetMemory(const size_t size, const int align) {
|
||||
if ( size > 0 && size < remaining_ && align == 1 ) { // common case
|
||||
last_alloc_ = freestart_;
|
||||
freestart_ += size;
|
||||
remaining_ -= size;
|
||||
return reinterpret_cast<void*>(last_alloc_);
|
||||
}
|
||||
return GetMemoryFallback(size, align);
|
||||
}
|
||||
|
||||
private:
|
||||
struct AllocatedBlock {
|
||||
char *mem;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// The returned AllocatedBlock* is valid until the next call to AllocNewBlock
|
||||
// or Reset (i.e. anything that might affect overflow_blocks_).
|
||||
AllocatedBlock *AllocNewBlock(const size_t block_size);
|
||||
|
||||
const AllocatedBlock *IndexToBlock(int index) const;
|
||||
|
||||
const size_t block_size_;
|
||||
char* freestart_; // beginning of the free space in most recent block
|
||||
char* freestart_when_empty_; // beginning of the free space when we're empty
|
||||
char* last_alloc_; // used to make sure ReturnBytes() is safe
|
||||
size_t remaining_;
|
||||
// STL vector isn't as efficient as it could be, so we use an array at first
|
||||
int blocks_alloced_; // how many of the first_blocks_ have been alloced
|
||||
AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary
|
||||
// if the first_blocks_ aren't enough, expand into overflow_blocks_.
|
||||
vector<AllocatedBlock>* overflow_blocks_;
|
||||
|
||||
void FreeBlocks(); // Frees all except first block
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena);
|
||||
};
|
||||
|
||||
// Operators for allocation on the arena
|
||||
// Syntax: new (AllocateInArena, arena) MyClass;
|
||||
// STL containers, etc.
|
||||
enum AllocateInArenaType { AllocateInArena };
|
||||
|
||||
} // namespace re2
|
||||
|
||||
inline void* operator new(size_t size,
|
||||
re2::AllocateInArenaType /* unused */,
|
||||
re2::UnsafeArena *arena) {
|
||||
return reinterpret_cast<char*>(arena->GetMemory(size, 1));
|
||||
}
|
||||
|
||||
#endif // RE2_UTIL_ARENA_H_
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UTIL_ATOMICOPS_H__
|
||||
#define RE2_UTIL_ATOMICOPS_H__
|
||||
|
||||
// WriteMemoryBarrier(): per-architecture write barrier, with a mutex-based
// fallback for architectures that have no dedicated implementation here.

#if defined(__i386__)

static inline void WriteMemoryBarrier() {
  // xchg with a memory operand carries an implicit lock prefix.
  // The instruction overwrites both the register and the memory it points
  // at, so the pointer is declared as a read-write operand ("+r") and x is
  // initialized before use.  The "memory" clobber stops the compiler from
  // moving stores across the barrier.
  int x = 0;
  int* p = &x;
  __asm__ __volatile__("xchgl (%0),%0"
                       : "+r" (p)
                       :
                       : "memory");
}

#elif defined(__x86_64__)

// 64-bit implementations of memory barrier can be simpler, because
// "sfence" is guaranteed to exist.
static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("sfence" : : : "memory");
}

#elif defined(__ppc__)

static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("eieio" : : : "memory");
}

#elif defined(__alpha__)

static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("wmb" : : : "memory");
}

#else

#include "util/mutex.h"

static inline void WriteMemoryBarrier() {
  // Slight overkill, but good enough:
  // any mutex implementation must have
  // a read barrier after the lock operation and
  // a write barrier before the unlock operation.
  //
  // It may be worthwhile to write architecture-specific
  // barriers for the common platforms, as above, but
  // this is a correct fallback.
  re2::Mutex mu;
  re2::MutexLock l(&mu);
}

/*
#error Need WriteMemoryBarrier for architecture.

// Windows
inline void WriteMemoryBarrier() {
  LONG x;
  ::InterlockedExchange(&x, 0);
}
*/

#endif
|
||||
|
||||
// Alpha has very weak memory ordering. If relying on WriteBarriers, one must
|
||||
// use read barriers for the readers too.
|
||||
#if !defined(__alpha__)

// No read barrier is issued on architectures other than Alpha.
static inline void MaybeReadMemoryBarrier() {
}

#else

// Alpha: issue a full "mb" so readers observe writes in order.
static inline void MaybeReadMemoryBarrier() {
  __asm__ __volatile__("mb" : : : "memory");
}

#endif  // __alpha__
|
||||
|
||||
#endif // RE2_UTIL_ATOMICOPS_H__
|
|
@ -0,0 +1,153 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/flags.h"
|
||||
#include "util/benchmark.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
||||
|
||||
using testing::Benchmark;
|
||||
using namespace re2;
|
||||
|
||||
static Benchmark* benchmarks[10000];
|
||||
static int nbenchmarks;
|
||||
|
||||
void Benchmark::Register() {
|
||||
benchmarks[nbenchmarks] = this;
|
||||
if(lo < 1)
|
||||
lo = 1;
|
||||
if(hi < lo)
|
||||
hi = lo;
|
||||
nbenchmarks++;
|
||||
}
|
||||
|
||||
// Current wall-clock time from gettimeofday(), in nanoseconds.
// Returns -1 if gettimeofday() reports an error.
static int64 nsec() {
  struct timeval now;
  const int rc = gettimeofday(&now, 0);
  if(rc < 0)
    return -1;
  const int64 whole_seconds_ns = (int64)now.tv_sec*1000*1000*1000;
  return whole_seconds_ns + now.tv_usec*1000;
}
|
||||
|
||||
static int64 bytes;
|
||||
static int64 ns;
|
||||
static int64 t0;
|
||||
static int64 items;
|
||||
|
||||
// Records the number of bytes the current benchmark run processed;
// RunBench uses it to print a MB/s figure.
void SetBenchmarkBytesProcessed(long long x)
{
  bytes = x;
}
|
||||
|
||||
void StopBenchmarkTiming() {
|
||||
if(t0 != 0)
|
||||
ns += nsec() - t0;
|
||||
t0 = 0;
|
||||
}
|
||||
|
||||
void StartBenchmarkTiming() {
|
||||
if(t0 == 0)
|
||||
t0 = nsec();
|
||||
}
|
||||
|
||||
// Records the number of items the current benchmark run processed.
void SetBenchmarkItemsProcessed(int n)
{
  items = n;
}
|
||||
|
||||
// Placeholder: memory-usage reporting is not implemented, so this is a
// no-op.
void BenchmarkMemoryUsage()
{
  // TODO(rsc): Implement.
}
|
||||
|
||||
// Number of CPUs reported to benchmarks.  Always 1 in this simplified
// harness.
int NumCPUs()
{
  const int cpus = 1;
  return cpus;
}
|
||||
|
||||
// Runs benchmark b for n iterations (passing siz to ranged benchmarks),
// after resetting the byte/item/time counters and starting the clock.
// Exits with status 2 if the benchmark has no function attached.
static void runN(Benchmark *b, int n, int siz) {
  bytes = 0;
  items = 0;
  ns = 0;
  t0 = nsec();

  if(!b->fn && !b->fnr) {
    fprintf(stderr, "%s: missing function\n", b->name);
    exit(2);
  }
  if(b->fn)
    b->fn(n);
  else
    b->fnr(n, siz);

  // Account for the final stretch if the clock is still running.
  if(t0 != 0)
    ns += nsec() - t0;
}
|
||||
|
||||
// Rounds n up to a "nice" iteration count of the form 2, 5 or 10 times a
// power of ten (values of exactly 10^k map to themselves).
//
// The arithmetic is done in long long: with int, base*10 overflowed as
// soon as n exceeded 1e9, which RunBench can reach (it may pass 1.5e9).
// A result that does not fit in int is clamped to the largest int.
static int round(int n) {
  long long base = 1;

  while(base*10 < n)
    base *= 10;

  long long rounded;
  if(n < 2*base)
    rounded = 2*base;
  else if(n < 5*base)
    rounded = 5*base;
  else
    rounded = 10*base;

  const long long int_max = (long long)(~0u >> 1);
  if(rounded > int_max)
    rounded = int_max;
  return (int)rounded;
}
|
||||
|
||||
// Runs one benchmark at one size and prints a result line.
//   b       - benchmark to run
//   nthread - requested thread count; only 1 is supported, anything else
//             returns without running
//   siz     - size argument passed to ranged benchmarks
// The iteration count is grown until a run takes at least one second or
// the count reaches 1e9.
void RunBench(Benchmark* b, int nthread, int siz) {
  int n, last;

  // TODO(rsc): Threaded benchmarks.
  if(nthread != 1)
    return;

  // run once in case it's expensive
  n = 1;
  runN(b, n, siz);
  while(ns < (int)1e9 && n < (int)1e9) {
    last = n;
    // Estimate how many iterations fill one second.
    if(ns/n == 0)
      n = 1e9;
    else
      n = 1e9 / (ns/n);

    // Overshoot the estimate by 50%, but grow by at most 100x and at
    // least by one.  Computed in 64 bits: 100*last does not fit in an
    // int once last exceeds about 21 million iterations.
    const int64 grown = (int64)n + n/2;
    const int64 cap = (int64)100*last;
    n = max(last+1, (int)min(grown, cap));
    n = round(n);
    runN(b, n, siz);
  }

  char mb[100];
  char suf[100];
  mb[0] = '\0';
  suf[0] = '\0';
  if(ns > 0 && bytes > 0)
    snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9));
  if(b->fnr || b->lo != b->hi) {
    if(siz >= (1<<20))
      snprintf(suf, sizeof suf, "/%dM", siz/(1<<20));
    else if(siz >= (1<<10))
      snprintf(suf, sizeof suf, "/%dK", siz/(1<<10));
    else
      snprintf(suf, sizeof suf, "/%d", siz);
  }
  printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb);
  fflush(stdout);
}
|
||||
|
||||
static int match(const char* name, int argc, const char** argv) {
|
||||
if(argc == 1)
|
||||
return 1;
|
||||
for(int i = 1; i < argc; i++)
|
||||
if(RE2::PartialMatch(name, argv[i]))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, const char** argv) {
|
||||
for(int i = 0; i < nbenchmarks; i++) {
|
||||
Benchmark* b = benchmarks[i];
|
||||
if(match(b->name, argc, argv))
|
||||
for(int j = b->threadlo; j <= b->threadhi; j++)
|
||||
for(int k = max(b->lo, 1); k <= max(b->hi, 1); k<<=1)
|
||||
RunBench(b, j, k);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UTIL_BENCHMARK_H__
|
||||
#define RE2_UTIL_BENCHMARK_H__
|
||||
|
||||
namespace testing {
|
||||
// Description of one benchmark.  Constructing an instance registers it.
struct Benchmark {
  const char* name;        // name used for selection and reporting
  void (*fn)(int);         // plain benchmark: fn(iterations)
  void (*fnr)(int, int);   // ranged benchmark: fnr(iterations, size)
  int lo;                  // smallest size for ranged benchmarks
  int hi;                  // largest size for ranged benchmarks
  int threadlo;            // smallest thread count to run with
  int threadhi;            // largest thread count to run with

  // Adds this benchmark to the global list (defined out of line).
  void Register();

  // Plain benchmark taking only an iteration count.
  Benchmark(const char* name, void (*f)(int)) {
    Clear(name);
    fn = f;
    Register();
  }

  // Ranged benchmark run over sizes l..h.
  Benchmark(const char* name, void (*f)(int, int), int l, int h) {
    Clear(name);
    fnr = f;
    lo = l;
    hi = h;
    Register();
  }

  // Sets the name and zeroes every other field.
  void Clear(const char* n) {
    name = n;
    fn = 0;
    fnr = 0;
    lo = 0;
    hi = 0;
    threadlo = 0;
    threadhi = 0;
  }

  // Sets the thread-count range; returns this so calls can be chained.
  Benchmark* ThreadRange(int lo, int hi) {
    threadlo = lo;
    threadhi = hi;
    return this;
  }
};
|
||||
} // namespace testing
|
||||
|
||||
void SetBenchmarkBytesProcessed(long long);
|
||||
void StopBenchmarkTiming();
|
||||
void StartBenchmarkTiming();
|
||||
void BenchmarkMemoryUsage();
|
||||
void SetBenchmarkItemsProcessed(int);
|
||||
|
||||
int NumCPUs();
|
||||
|
||||
#define BENCHMARK(f) \
|
||||
::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))
|
||||
|
||||
#define BENCHMARK_RANGE(f, lo, hi) \
|
||||
::testing::Benchmark* _benchmark_##f = \
|
||||
(new ::testing::Benchmark(#f, f, lo, hi))
|
||||
|
||||
#endif // RE2_UTIL_BENCHMARK_H__
|
|
@ -0,0 +1,27 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Simplified version of Google's command line flags.
|
||||
// Does not support parsing the command line.
|
||||
// If you want to do that, see
|
||||
// http://code.google.com/p/google-gflags
|
||||
|
||||
#ifndef RE2_UTIL_FLAGS_H__
|
||||
#define RE2_UTIL_FLAGS_H__
|
||||
|
||||
#define DEFINE_flag(type, name, deflt, desc) \
|
||||
namespace re2 { type FLAGS_##name = deflt; }
|
||||
|
||||
#define DECLARE_flag(type, name) \
|
||||
namespace re2 { extern type FLAGS_##name; }
|
||||
|
||||
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
|
||||
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc)
|
||||
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
|
||||
|
||||
#define DECLARE_bool(name) DECLARE_flag(bool, name)
|
||||
#define DECLARE_int32(name) DECLARE_flag(int32, name)
|
||||
#define DECLARE_string(name) DECLARE_flag(string, name)
|
||||
|
||||
#endif // RE2_UTIL_FLAGS_H__
|
|
@ -0,0 +1,231 @@
|
|||
// Modified by Russ Cox to add "namespace re2".
|
||||
// Also threw away all but hashword and hashword2.
|
||||
// http://burtleburtle.net/bob/c/lookup3.c
|
||||
|
||||
/*
|
||||
-------------------------------------------------------------------------------
|
||||
lookup3.c, by Bob Jenkins, May 2006, Public Domain.
|
||||
|
||||
These are functions for producing 32-bit hashes for hash table lookup.
|
||||
hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
|
||||
are externally useful functions. Routines to test the hash are included
|
||||
if SELF_TEST is defined. You can use this free for any purpose. It's in
|
||||
the public domain. It has no warranty.
|
||||
|
||||
You probably want to use hashlittle(). hashlittle() and hashbig()
|
||||
hash byte arrays. hashlittle() is faster than hashbig() on
|
||||
little-endian machines. Intel and AMD are little-endian machines.
|
||||
On second thought, you probably want hashlittle2(), which is identical to
|
||||
hashlittle() except it returns two 32-bit hashes for the price of one.
|
||||
You could implement hashbig2() if you wanted but I haven't bothered here.
|
||||
|
||||
If you want to find a hash of, say, exactly 7 integers, do
|
||||
a = i1; b = i2; c = i3;
|
||||
mix(a,b,c);
|
||||
a += i4; b += i5; c += i6;
|
||||
mix(a,b,c);
|
||||
a += i7;
|
||||
final(a,b,c);
|
||||
then use c as the hash value. If you have a variable length array of
|
||||
4-byte integers to hash, use hashword(). If you have a byte array (like
|
||||
a character string), use hashlittle(). If you have several byte arrays, or
|
||||
a mix of things, see the comments above hashlittle().
|
||||
|
||||
Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
|
||||
then mix those integers. This is fast (you can do a lot more thorough
|
||||
mixing with 12*3 instructions on 3 integers than you can with 3 instructions
|
||||
on 1 byte), but shoehorning those bytes into integers efficiently is messy.
|
||||
-------------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
|
||||
|
||||
/*
|
||||
-------------------------------------------------------------------------------
|
||||
mix -- mix 3 32-bit values reversibly.
|
||||
|
||||
This is reversible, so any information in (a,b,c) before mix() is
|
||||
still in (a,b,c) after mix().
|
||||
|
||||
If four pairs of (a,b,c) inputs are run through mix(), or through
|
||||
mix() in reverse, there are at least 32 bits of the output that
|
||||
are sometimes the same for one pair and different for another pair.
|
||||
This was tested for:
|
||||
* pairs that differed by one bit, by two bits, in any combination
|
||||
of top bits of (a,b,c), or in any combination of bottom bits of
|
||||
(a,b,c).
|
||||
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
|
||||
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
|
||||
is commonly produced by subtraction) look like a single 1-bit
|
||||
difference.
|
||||
* the base values were pseudorandom, all zero but one bit set, or
|
||||
all zero plus a counter that starts at zero.
|
||||
|
||||
Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
|
||||
satisfy this are
|
||||
4 6 8 16 19 4
|
||||
9 15 3 18 27 15
|
||||
14 9 3 7 17 3
|
||||
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
|
||||
for "differ" defined as + with a one-bit base and a two-bit delta. I
|
||||
used http://burtleburtle.net/bob/hash/avalanche.html to choose
|
||||
the operations, constants, and arrangements of the variables.
|
||||
|
||||
This does not achieve avalanche. There are input bits of (a,b,c)
|
||||
that fail to affect some output bits of (a,b,c), especially of a. The
|
||||
most thoroughly mixed value is c, but it doesn't really even achieve
|
||||
avalanche in c.
|
||||
|
||||
This allows some parallelism. Read-after-writes are good at doubling
|
||||
the number of bits affected, so the goal of mixing pulls in the opposite
|
||||
direction as the goal of parallelism. I did what I could. Rotates
|
||||
seem to cost as much as shifts on every machine I could lay my hands
|
||||
on, and rotates are much kinder to the top and bottom bits, so I used
|
||||
rotates.
|
||||
-------------------------------------------------------------------------------
|
||||
*/
|
||||
#define mix(a,b,c) \
|
||||
{ \
|
||||
a -= c; a ^= rot(c, 4); c += b; \
|
||||
b -= a; b ^= rot(a, 6); a += c; \
|
||||
c -= b; c ^= rot(b, 8); b += a; \
|
||||
a -= c; a ^= rot(c,16); c += b; \
|
||||
b -= a; b ^= rot(a,19); a += c; \
|
||||
c -= b; c ^= rot(b, 4); b += a; \
|
||||
}
|
||||
|
||||
/*
|
||||
-------------------------------------------------------------------------------
|
||||
final -- final mixing of 3 32-bit values (a,b,c) into c
|
||||
|
||||
Pairs of (a,b,c) values differing in only a few bits will usually
|
||||
produce values of c that look totally different. This was tested for
|
||||
* pairs that differed by one bit, by two bits, in any combination
|
||||
of top bits of (a,b,c), or in any combination of bottom bits of
|
||||
(a,b,c).
|
||||
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
|
||||
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
|
||||
is commonly produced by subtraction) look like a single 1-bit
|
||||
difference.
|
||||
* the base values were pseudorandom, all zero but one bit set, or
|
||||
all zero plus a counter that starts at zero.
|
||||
|
||||
These constants passed:
|
||||
14 11 25 16 4 14 24
|
||||
12 14 25 16 4 14 24
|
||||
and these came close:
|
||||
4 8 15 26 3 22 24
|
||||
10 8 15 26 3 22 24
|
||||
11 8 15 26 3 22 24
|
||||
-------------------------------------------------------------------------------
|
||||
*/
|
||||
#define final(a,b,c) \
|
||||
{ \
|
||||
c ^= b; c -= rot(b,14); \
|
||||
a ^= c; a -= rot(c,11); \
|
||||
b ^= a; b -= rot(a,25); \
|
||||
c ^= b; c -= rot(b,16); \
|
||||
a ^= c; a -= rot(c,4); \
|
||||
b ^= a; b -= rot(a,14); \
|
||||
c ^= b; c -= rot(b,24); \
|
||||
}
|
||||
|
||||
namespace re2 {
|
||||
|
||||
/*
|
||||
--------------------------------------------------------------------
|
||||
This works on all machines. To be useful, it requires
|
||||
-- that the key be an array of uint32_t's, and
|
||||
-- that the length be the number of uint32_t's in the key
|
||||
|
||||
The function hashword() is identical to hashlittle() on little-endian
|
||||
machines, and identical to hashbig() on big-endian machines,
|
||||
except that the length has to be measured in uint32_ts rather than in
|
||||
bytes. hashlittle() is more complicated than hashword() only because
|
||||
hashlittle() has to dance around fitting the key bytes into registers.
|
||||
--------------------------------------------------------------------
|
||||
*/
|
||||
uint32 hashword(
const uint32 *k,                   /* the key, an array of uint32_t values */
size_t        length,              /* the length of the key, in uint32_ts */
uint32        initval)             /* the previous hash, or an arbitrary value */
{
  uint32_t a, b, c;

  /* Seed all three state words from the length and the initial value */
  a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;

  /* Consume the key three words at a time while more than three remain */
  for (; length > 3; length -= 3, k += 3)
  {
    a += k[0];
    b += k[1];
    c += k[2];
    mix(a,b,c);
  }

  /* Between zero and three words are left; an empty tail is not mixed */
  if (length == 0)
    return c;

  if (length == 3)
    c += k[2];
  if (length >= 2)
    b += k[1];
  a += k[0];
  final(a,b,c);

  /* c carries the result */
  return c;
}
|
||||
|
||||
|
||||
/*
|
||||
--------------------------------------------------------------------
|
||||
hashword2() -- same as hashword(), but take two seeds and return two
|
||||
32-bit values. pc and pb must both be nonnull, and *pc and *pb must
|
||||
both be initialized with seeds. If you pass in (*pb)==0, the output
|
||||
(*pc) will be the same as the return value from hashword().
|
||||
--------------------------------------------------------------------
|
||||
*/
|
||||
void hashword2 (
|
||||
const uint32 *k, /* the key, an array of uint32_t values */
|
||||
size_t length, /* the length of the key, in uint32_ts */
|
||||
uint32 *pc, /* IN: seed OUT: primary hash value */
|
||||
uint32 *pb) /* IN: more seed OUT: secondary hash value */
|
||||
{
|
||||
uint32_t a,b,c;
|
||||
|
||||
/* Set up the internal state */
|
||||
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
|
||||
c += *pb;
|
||||
|
||||
/*------------------------------------------------- handle most of the key */
|
||||
while (length > 3)
|
||||
{
|
||||
a += k[0];
|
||||
b += k[1];
|
||||
c += k[2];
|
||||
mix(a,b,c);
|
||||
length -= 3;
|
||||
k += 3;
|
||||
}
|
||||
|
||||
/*------------------------------------------- handle the last 3 uint32_t's */
|
||||
switch(length) /* all the case statements fall through */
|
||||
{
|
||||
case 3 : c+=k[2];
|
||||
case 2 : b+=k[1];
|
||||
case 1 : a+=k[0];
|
||||
final(a,b,c);
|
||||
case 0: /* case 0: nothing left to add */
|
||||
break;
|
||||
}
|
||||
/*------------------------------------------------------ report the result */
|
||||
*pc=c; *pb=b;
|
||||
}
|
||||
|
||||
} // namespace re2
|
|
@ -0,0 +1,78 @@
|
|||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Simplified version of Google's logging.
|
||||
|
||||
#ifndef RE2_UTIL_LOGGING_H__
|
||||
#define RE2_UTIL_LOGGING_H__
|
||||
|
||||
#include <unistd.h> /* for write */
|
||||
#include <sstream>
|
||||
|
||||
// Debug-only checking.
// Each DCHECK* macro forwards straight to assert(); presumably this is the
// standard <assert.h> macro (not included by this header -- TODO confirm where
// it comes from), in which case the checks vanish when NDEBUG is defined.
|
||||
#define DCHECK(condition) assert(condition)
|
||||
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
|
||||
#define DCHECK_NE(val1, val2) assert((val1) != (val2))
|
||||
#define DCHECK_LE(val1, val2) assert((val1) <= (val2))
|
||||
#define DCHECK_LT(val1, val2) assert((val1) < (val2))
|
||||
#define DCHECK_GE(val1, val2) assert((val1) >= (val2))
|
||||
#define DCHECK_GT(val1, val2) assert((val1) > (val2))
|
||||
|
||||
// Always-on checking
// CHECK(x) does nothing when x is true.  When x is false it creates a
// temporary LogMessageFatal and streams "Check failed: " followed by the
// stringified expression; the caller may append more text with <<.
// The "if(x){}else ..." shape keeps a following "else" in user code from
// binding to the macro's own "if".
|
||||
#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
|
||||
#define CHECK_LT(x, y) CHECK((x) < (y))
|
||||
#define CHECK_GT(x, y) CHECK((x) > (y))
|
||||
#define CHECK_LE(x, y) CHECK((x) <= (y))
|
||||
#define CHECK_GE(x, y) CHECK((x) >= (y))
|
||||
#define CHECK_EQ(x, y) CHECK((x) == (y))
|
||||
#define CHECK_NE(x, y) CHECK((x) != (y))
|
||||
|
||||
// Severity selectors used by LOG(severity).  INFO, ERROR and WARNING all
// expand to the same plain LogMessage temporary; FATAL and QFATAL both
// expand to a LogMessageFatal temporary.
#define LOG_INFO LogMessage(__FILE__, __LINE__)
|
||||
#define LOG_ERROR LOG_INFO
|
||||
#define LOG_WARNING LOG_INFO
|
||||
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
|
||||
#define LOG_QFATAL LOG_FATAL
|
||||
|
||||
// VLOG(x): as written, the message is emitted (at INFO) only when (x) > 0
// is false, i.e. for x <= 0; any positive level is skipped.
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
|
||||
|
||||
// DEBUG_MODE is 0 when NDEBUG is defined and 1 otherwise.
// LOG_DFATAL is fatal only when NDEBUG is not defined; with NDEBUG it is
// the same as LOG_ERROR.
#ifdef NDEBUG
|
||||
#define DEBUG_MODE 0
|
||||
#define LOG_DFATAL LOG_ERROR
|
||||
#else
|
||||
#define DEBUG_MODE 1
|
||||
#define LOG_DFATAL LOG_FATAL
|
||||
#endif
|
||||
|
||||
// LOG(severity) pastes the severity onto "LOG_" to pick one of the macros
// above and yields that temporary's stream for use with <<.
#define LOG(severity) LOG_ ## severity.stream()
|
||||
|
||||
class LogMessage {
|
||||
public:
|
||||
LogMessage(const char* file, int line) {
|
||||
stream() << file << ":" << line << ": ";
|
||||
}
|
||||
~LogMessage() {
|
||||
stream() << "\n";
|
||||
string s = str_.str();
|
||||
if(write(2, s.data(), s.size()) < 0) {} // shut up gcc
|
||||
}
|
||||
ostream& stream() { return str_; }
|
||||
|
||||
private:
|
||||
std::ostringstream str_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
|
||||
};
|
||||
|
||||
class LogMessageFatal : public LogMessage {
|
||||
public:
|
||||
LogMessageFatal(const char* file, int line)
|
||||
: LogMessage(file, line) { }
|
||||
~LogMessageFatal() {
|
||||
std::cerr << "\n";
|
||||
abort();
|
||||
}
|
||||
private:
|
||||
DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal);
|
||||
};
|
||||
|
||||
#endif // RE2_UTIL_LOGGING_H__
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue