From 220ca84223dca5aa7a58c1a941d745c1387d29be Mon Sep 17 00:00:00 2001 From: Lee Campbell Date: Thu, 30 Jul 2015 09:27:11 -0700 Subject: [PATCH] init: Add C++ tokenizer. Adds a C++ tokenizer along with unit tests. This tokenizer will replace the current C implementation which does a poor job of keeping track of pointers. This CL is a prerequisite for upcoming changes to the parser. This CL does not wire up this tokenizer and changes no existing code. All that builds is the unit tests. Change-Id: Iec3740bce7153640adc5e5bbdc57e644cedf0038 TEST: Unit tests all pass. No leaks under valgrind BUG: 22843198 --- init/Android.mk | 21 +++ init/parser/tokenizer.cpp | 129 ++++++++++++++++++ init/parser/tokenizer.h | 69 ++++++++++ init/parser/tokenizer_test.cpp | 230 +++++++++++++++++++++++++++++++++ 4 files changed, 449 insertions(+) create mode 100644 init/parser/tokenizer.cpp create mode 100644 init/parser/tokenizer.h create mode 100644 init/parser/tokenizer_test.cpp diff --git a/init/Android.mk b/init/Android.mk index 45b002de5..1611b8111 100644 --- a/init/Android.mk +++ b/init/Android.mk @@ -20,6 +20,27 @@ init_cflags += \ # -- +# If building on Linux, then build unit test for the host.
+ifeq ($(HOST_OS),linux) +include $(CLEAR_VARS) +LOCAL_CPPFLAGS := $(init_cflags) +LOCAL_SRC_FILES:= \ + parser/tokenizer.cpp \ + +LOCAL_MODULE := libinit_parser +LOCAL_CLANG := true +include $(BUILD_HOST_STATIC_LIBRARY) + +include $(CLEAR_VARS) +LOCAL_MODULE := init_parser_tests +LOCAL_SRC_FILES := \ + parser/tokenizer_test.cpp \ + +LOCAL_STATIC_LIBRARIES := libinit_parser +LOCAL_CLANG := true +include $(BUILD_HOST_NATIVE_TEST) +endif + include $(CLEAR_VARS) LOCAL_CPPFLAGS := $(init_cflags) LOCAL_SRC_FILES:= \ diff --git a/init/parser/tokenizer.cpp b/init/parser/tokenizer.cpp new file mode 100644 index 000000000..340e0d9ae --- /dev/null +++ b/init/parser/tokenizer.cpp @@ -0,0 +1,129 @@ +// Copyright (C) 2015 The Android Open Source Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tokenizer.h" + +namespace init { + +Tokenizer::Tokenizer(const std::string& data) + : data_(data), eof_(false), pos_(0), tok_start_(0) { + current_.type = TOK_START; // Stays TOK_START until the first Next(). + + if (data.size() > 0) { + cur_char_ = data[0]; + } else { + eof_ = true; // Empty input: already at end of input. + cur_char_ = '\0'; + } +} + +Tokenizer::~Tokenizer() {} + +const Tokenizer::Token& Tokenizer::current() { + return current_; +} + +bool Tokenizer::Next() { + while (!eof_) { + AdvWhiteSpace(); // Skips '\t', '\r' and ' ' but not '\n'. + + // Check for comments.
+ if (cur_char_ == '#') { + AdvChar(); + // Skip rest of line, leaving the '\n' to be tokenized below. + while (!eof_ && cur_char_ != '\n') { + AdvChar(); + } + } + + if (eof_) { + break; + } + + if (cur_char_ == '\0') { // NUL inside the input (not end of input): skip it. + AdvChar(); + } else if (cur_char_ == '\n') { + current_.type = TOK_NEWLINE; + current_.text.clear(); + AdvChar(); + return true; + } else if (cur_char_ == '\\') { // Only reached when the backslash starts a word. + AdvChar(); // skip backslash + // This is line continuation so + // do not generate TOK_NEWLINE at + // the next \n. + AdvUntil('\n'); // Anything between the backslash and the \n is discarded. + AdvChar(); // skip \n + } else if (cur_char_ == '\"') { // Only reached when the quote starts a word. + AdvChar(); + StartText(); + // Grab everything until the next quote. + AdvUntil('\"'); // Stops at end of input if the quote is never closed. + EndText(); + AdvChar(); // skip quote. + return true; + } else { + StartText(); + AdvText(); + EndText(); + return true; + } + } + current_.type = TOK_END; + current_.text.clear(); + return false; +} + +void Tokenizer::AdvChar() { // Steps one character forward, or flags end of input. + pos_++; + if (pos_ < data_.size()) { + cur_char_ = data_[pos_]; + } else { + eof_ = true; + cur_char_ = '\0'; // Sentinel that stops AdvWhiteSpace() and AdvText(). + } +} + +void Tokenizer::AdvWhiteSpace() { // Skips tabs, carriage returns and spaces. + while (cur_char_ == '\t' || cur_char_ == '\r' || cur_char_ == ' ') { + AdvChar(); + } +} + +void Tokenizer::AdvUntil(char x) { // Stops on x (not consumed) or at end of input. + while (!eof_ && cur_char_ != x) { + AdvChar(); + } +} + +void Tokenizer::AdvText() { // Consumes a word up to whitespace, NUL, newline or '#'. + while (cur_char_ != '\t' && cur_char_ != '\r' && cur_char_ != '\0' && + cur_char_ != ' ' && cur_char_ != '\n' && cur_char_ != '#') { + AdvChar(); + } +} + +void Tokenizer::StartText() { // Begins a TOK_TEXT token at the current position. + current_.text.clear(); + tok_start_ = pos_; + current_.type = TOK_TEXT; +} + +void Tokenizer::EndText() { // Copies data_[tok_start_, pos_) into the token text. + if (pos_ != tok_start_) { + current_.text.append(data_, tok_start_, pos_ - tok_start_); + } +} + +} // namespace init \ No newline at end of file diff --git a/init/parser/tokenizer.h b/init/parser/tokenizer.h new file mode 100644 index 000000000..40a22b1af --- /dev/null +++ b/init/parser/tokenizer.h @@ -0,0 +1,69 @@ +// Copyright (C) 2015 The Android Open Source Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use
this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <string> + +namespace init { + +// Used to tokenize a std::string. +// Call Next() to advance through each token until it returns false, +// indicating there are no more tokens left in the string. +// The current token can be accessed with current(), which returns +// a Token. +// Supported tokens are: +// TOK_START - Next() has yet to be called +// TOK_END - At the end of string +// TOK_NEWLINE - The end of a line denoted by \n. +// TOK_TEXT - A word. +// Comments are denoted with '#' and the tokenizer will ignore +// the rest of the line. +// Double quotes can be used to insert whitespace into words. +// A backslash at the end of a line denotes continuation and +// a TOK_NEWLINE will not be generated for that line. +class Tokenizer { + public: + Tokenizer(const std::string& data); // Tokenizes a private copy of |data|. + ~Tokenizer(); + + enum TokenType { TOK_START, TOK_END, TOK_NEWLINE, TOK_TEXT }; + struct Token { + TokenType type; + std::string text; // The word for TOK_TEXT; empty for the other types. + }; + + // Returns the current token. + const Token& current(); + + // Move to the next token, returns false at the end of input.
+ bool Next(); + + private: + void AdvChar(); + void AdvText(); + void AdvUntil(char x); + void AdvWhiteSpace(); + void StartText(); + void EndText(); + + const std::string data_; // Owned copy, so a temporary argument cannot dangle. + Token current_; + + bool eof_; // Set once pos_ reaches the end of data_. + size_t pos_; // Index of cur_char_ within data_. + char cur_char_; // data_[pos_], or '\0' at end of input. + size_t tok_start_; // Index where the current text token starts. +}; + +} // namespace init diff --git a/init/parser/tokenizer_test.cpp b/init/parser/tokenizer_test.cpp new file mode 100644 index 000000000..c4a48df8c --- /dev/null +++ b/init/parser/tokenizer_test.cpp @@ -0,0 +1,230 @@ +// Copyright (C) 2015 The Android Open Source Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "tokenizer.h" + +#include <errno.h> +#include <gtest/gtest.h> + +#include <string> + +namespace init { + +#define SETUP_TEST(test_data) \ + std::string data(test_data); \ + Tokenizer tokenizer(data); \ + ASSERT_EQ(Tokenizer::TOK_START, tokenizer.current().type) + +#define ASSERT_TEXT_TOKEN(test_text) \ + ASSERT_TRUE(tokenizer.Next()); \ + ASSERT_EQ(test_text, tokenizer.current().text); \ + ASSERT_EQ(Tokenizer::TOK_TEXT, tokenizer.current().type) + +#define ASSERT_NEWLINE_TOKEN() \ + ASSERT_TRUE(tokenizer.Next()); \ + ASSERT_EQ(Tokenizer::TOK_NEWLINE, tokenizer.current().type) + +TEST(Tokenizer, Empty) { // Empty input yields no tokens at all. + SETUP_TEST(""); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, Simple) { + SETUP_TEST("test"); + ASSERT_TEXT_TOKEN("test"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, LeadingWhiteSpace) { // Tabs, carriage returns and spaces are skipped. + SETUP_TEST(" \t \r test"); + ASSERT_TEXT_TOKEN("test"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, TrailingWhiteSpace) { + SETUP_TEST("test \t \r "); + ASSERT_TEXT_TOKEN("test"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, WhiteSpace) { + SETUP_TEST(" \t \r test \t \r "); + ASSERT_TEXT_TOKEN("test"); + + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, TwoTokens) { + SETUP_TEST(" foo bar "); + ASSERT_TEXT_TOKEN("foo"); + ASSERT_TEXT_TOKEN("bar"); + + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, MultiToken) { + SETUP_TEST("one two three four five"); + ASSERT_TEXT_TOKEN("one"); + ASSERT_TEXT_TOKEN("two"); + ASSERT_TEXT_TOKEN("three"); + ASSERT_TEXT_TOKEN("four"); + ASSERT_TEXT_TOKEN("five"); + + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, NewLine) { // '\n' is a token of its own, not whitespace. + SETUP_TEST("\n"); + ASSERT_NEWLINE_TOKEN(); + + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, TextNewLine) { + SETUP_TEST("test\n"); + ASSERT_TEXT_TOKEN("test"); + ASSERT_NEWLINE_TOKEN(); + + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, MultiTextNewLine) { + SETUP_TEST("one\ntwo\nthree\n"); + ASSERT_TEXT_TOKEN("one"); + ASSERT_NEWLINE_TOKEN(); +
ASSERT_TEXT_TOKEN("two"); + ASSERT_NEWLINE_TOKEN(); + ASSERT_TEXT_TOKEN("three"); + ASSERT_NEWLINE_TOKEN(); + + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, MultiTextNewLineNoLineEnding) { + SETUP_TEST("one\ntwo\nthree"); + ASSERT_TEXT_TOKEN("one"); + ASSERT_NEWLINE_TOKEN(); + ASSERT_TEXT_TOKEN("two"); + ASSERT_NEWLINE_TOKEN(); + ASSERT_TEXT_TOKEN("three"); + + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, Comment) { // A comment on its own yields no tokens. + SETUP_TEST("#test"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, CommentWhiteSpace) { + SETUP_TEST(" \t \r #test \t \r "); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, CommentNewLine) { // The '\n' ending a comment still yields TOK_NEWLINE. + SETUP_TEST(" #test \n"); + ASSERT_NEWLINE_TOKEN(); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, CommentTwoNewLine) { + SETUP_TEST(" #test \n#test"); + ASSERT_NEWLINE_TOKEN(); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, CommentWithText) { + SETUP_TEST("foo bar #test"); + ASSERT_TEXT_TOKEN("foo"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, CommentWithTextNoSpace) { // '#' ends a word even without whitespace. + SETUP_TEST("foo bar#test"); + ASSERT_TEXT_TOKEN("foo"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, CommentWithTextLineFeed) { + SETUP_TEST("foo bar #test\n"); + ASSERT_TEXT_TOKEN("foo"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_NEWLINE_TOKEN(); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, CommentWithMultiTextLineFeed) { + SETUP_TEST("#blah\nfoo bar #test\n#blah"); + ASSERT_NEWLINE_TOKEN(); + ASSERT_TEXT_TOKEN("foo"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_NEWLINE_TOKEN(); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, SimpleEscaped) { // A backslash inside a word is kept as ordinary text. + SETUP_TEST("fo\\no bar"); + ASSERT_TEXT_TOKEN("fo\\no"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, EscapedLineContNoLineFeed) { // A lone backslash discards the rest of the input. + SETUP_TEST("fo\\no bar \\ hello"); + ASSERT_TEXT_TOKEN("fo\\no"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_FALSE(tokenizer.Next()); +} +
+TEST(Tokenizer, EscapedLineContLineFeed) { // The backslash swallows the rest of the line, '\n' included. + SETUP_TEST("fo\\no bar \\ hello\n"); + ASSERT_TEXT_TOKEN("fo\\no"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, EscapedLineCont) { // A backslash before '\n' joins the lines; no TOK_NEWLINE. + SETUP_TEST("fo\\no bar \\\ntest"); + ASSERT_TEXT_TOKEN("fo\\no"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_TEXT_TOKEN("test"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, EscapedLineContWithBadChars) { // Text between the backslash and '\n' is dropped. + SETUP_TEST("fo\\no bar \\bad bad bad\ntest"); + ASSERT_TEXT_TOKEN("fo\\no"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_TEXT_TOKEN("test"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, SimpleQuotes) { // Quotes keep embedded whitespace inside one token. + SETUP_TEST("foo \"single token\" bar"); + ASSERT_TEXT_TOKEN("foo"); + ASSERT_TEXT_TOKEN("single token"); + ASSERT_TEXT_TOKEN("bar"); + ASSERT_FALSE(tokenizer.Next()); +} + +TEST(Tokenizer, BadQuotes) { // An unterminated quote extends to the end of input. + SETUP_TEST("foo \"single token"); + ASSERT_TEXT_TOKEN("foo"); + ASSERT_TEXT_TOKEN("single token"); + ASSERT_FALSE(tokenizer.Next()); +} + +} // namespace init