master_lexer.cc 12.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (C) 2012  Internet Systems Consortium, Inc. ("ISC")
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
// AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THIS SOFTWARE.

15 16
#include <exceptions/exceptions.h>

17
#include <dns/master_lexer.h>
18
#include <dns/master_lexer_inputsource.h>
19
#include <dns/master_lexer_state.h>
20

21 22
#include <boost/shared_ptr.hpp>

23
#include <bitset>
24 25
#include <cassert>
#include <string>
26 27 28 29 30
#include <vector>

namespace isc {
namespace dns {

31 32
namespace {
// Shared-ownership handle for the input sources this file keeps in the
// lexer's source container.
typedef boost::shared_ptr<master_lexer_internal::InputSource> InputSourcePtr;
} // end of unnamed namespace
using namespace master_lexer_internal;

// Private implementation data of MasterLexer, together with a few helpers
// shared by the state classes defined later in this file.
struct MasterLexer::MasterLexerImpl {
    MasterLexerImpl() : source_(NULL), token_(Token::NOT_STARTED),
                        paren_count_(0), last_was_eol_(false)
    {
        // Characters that terminate an unquoted string token.
        separators_.set('\r');
        separators_.set('\n');
        separators_.set(' ');
        separators_.set('\t');
        separators_.set('(');
        separators_.set(')');
        // Characters that terminate a token even right after a backslash.
        esc_separators_.set('\r');
        esc_separators_.set('\n');
    }

    // A helper method to skip possible comments toward the end of EOL or EOF.
    // commonly used by state classes.  It returns the corresponding "end-of"
    // character in case it's a comment; otherwise it simply returns the
    // current character.
    //
    // c is the character just read from the current source; escaped tells
    // whether it was preceded by an unescaped backslash, in which case ';'
    // does not start a comment.
    int skipComment(int c, bool escaped = false) {
        if (c == ';' && !escaped) {
            while (true) {
                c = source_->getChar();
                if (c == '\n' || c == InputSource::END_OF_STREAM) {
                    return (c);
                }
            }
        }
        return (c);
    }

    // Return true if c terminates a string/number token.  escaped selects
    // the reduced separator set used right after a backslash.
    bool isTokenEnd(int c, bool escaped) {
        // Special case of EOF (end of stream); this is not in the bitmaps
        if (c == InputSource::END_OF_STREAM) {
            return (true);
        }
        // The bitmaps only describe 7-bit ASCII.  A value outside that range
        // must not be folded into it with a mask: doing so would alias it
        // onto an unrelated ASCII separator (e.g. 0xa9 onto ')' or 0xa0 onto
        // ' ') and cut a token in the middle.  Such values are never
        // separators.
        if (c < 0 || c > 0x7f) {
            return (false);
        }
        return (escaped ? esc_separators_.test(c) : separators_.test(c));
    }

    std::vector<InputSourcePtr> sources_;
    InputSource* source_;       // current source (NULL if sources_ is empty)
    Token token_;               // currently recognized token (set by a state)
    std::vector<char> data_;    // placeholder for string data

    // These are used in states, and defined here only as a placeholder.
    // The main lexer class does not need these members.
    size_t paren_count_;        // nest count of the parentheses
    bool last_was_eol_; // whether the lexer just passed an end-of-line

    // Bitmaps that give whether a given 7-bit ASCII character should be
    // considered a separator of a string/number token.  The esc_ version
    // is a subset of the other, excluding characters that can be ignored
    // if escaped by a backslash.  See isTokenEnd() for how values outside
    // the bitmap range are handled.
    std::bitset<128> separators_;
    std::bitset<128> esc_separators_;
};

// Create the lexer together with its private implementation object.
MasterLexer::MasterLexer() :
    impl_(new MasterLexerImpl())
{}

// Destroy the lexer, releasing the implementation object held in impl_.
MasterLexer::~MasterLexer() {
    delete impl_;
}

103 104
bool
MasterLexer::pushSource(const char* filename, std::string* error) {
JINMEI Tatuya's avatar
JINMEI Tatuya committed
105
    if (filename == NULL) {
106 107
        isc_throw(InvalidParameter,
                  "NULL filename for MasterLexer::pushSource");
JINMEI Tatuya's avatar
JINMEI Tatuya committed
108
    }
109 110 111 112 113 114 115 116 117
    try {
        impl_->sources_.push_back(InputSourcePtr(new InputSource(filename)));
    } catch (const InputSource::OpenError& ex) {
        if (error != NULL) {
            *error = ex.what();
        }
        return (false);
    }

118
    impl_->source_ = impl_->sources_.back().get();
119
    return (true);
120 121
}

122
void
123
MasterLexer::pushSource(std::istream& input) {
124
    impl_->sources_.push_back(InputSourcePtr(new InputSource(input)));
125
    impl_->source_ = impl_->sources_.back().get();
126 127
}

128
void
129
MasterLexer::popSource() {
130
    if (impl_->sources_.empty()) {
131 132
        isc_throw(InvalidOperation,
                  "MasterLexer::popSource on an empty source");
133 134
    }
    impl_->sources_.pop_back();
135 136
    impl_->source_ = impl_->sources_.empty() ? NULL :
        impl_->sources_.back().get();
137 138
}

139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
std::string
MasterLexer::getSourceName() const {
    if (impl_->sources_.empty()) {
        return (std::string());
    }
    return (impl_->sources_.back()->getName());
}

size_t
MasterLexer::getSourceLine() const {
    if (impl_->sources_.empty()) {
        return (0);
    }
    return (impl_->sources_.back()->getCurrentLine());
}
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176

namespace {
const char* const error_text[] = {
    "lexer not started",        // NOT_STARTED
    "unbalanced parentheses",   // UNBALANCED_PAREN
    "unexpected end of input",  // UNEXPECTED_END
    "unbalanced quotes"         // UNBALANCED_QUOTES
};
const size_t error_text_max_count = sizeof(error_text) / sizeof(error_text[0]);
}

// Return the textual description of the error carried by this token.
// Throws InvalidOperation if the token is not of the ERROR type.
std::string
MasterLexer::Token::getErrorText() const {
    if (type_ != ERROR) {
        isc_throw(InvalidOperation,
                  "Token::getErrorText() for non error type");
    }

    // The class integrity ensures the code is a valid index of the table.
    const size_t code = val_.error_code_;
    assert(code < error_text_max_count);
    return (std::string(error_text[code]));
}

177
namespace master_lexer_internal {
178 179
// Below we implement state classes for state transitions of MasterLexer.
// Note that these need to be defined here so that they can refer to
180
// the details of MasterLexerImpl.
181

182 183 184
// Lets the code in this namespace write Token instead of MasterLexer::Token.
typedef MasterLexer::Token Token; // convenience shortcut

bool
185
State::wasLastEOL(const MasterLexer& lexer) const {
186 187 188
    return (lexer.impl_->last_was_eol_);
}

189 190
// Return a reference to the token currently stored in the lexer.
const MasterLexer::Token&
State::getToken(const MasterLexer& lexer) const {
    const MasterLexer::Token& current = lexer.impl_->token_;
    return (current);
}

194 195 196 197 198
// Return the lexer's current nest count of parentheses.
size_t
State::getParenCount(const MasterLexer& lexer) const {
    const size_t depth = lexer.impl_->paren_count_;
    return (depth);
}

199
namespace {
200 201 202
class CRLF : public State {
public:
    CRLF() {}
203
    virtual ~CRLF() {}          // see the base class for the destructor
204
    virtual const State* handle(MasterLexer& lexer) const {
JINMEI Tatuya's avatar
JINMEI Tatuya committed
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
        // We've just seen '\r'.  If this is part of a sequence of '\r\n',
        // we combine them as a single END-OF-LINE.  Otherwise we treat the
        // single '\r' as an EOL and continue tokeniziation from the character
        // immediately after '\r'.  One tricky case is that there's a comment
        // between '\r' and '\n'.  This implementation combines these
        // characters and treats them as a single EOL (the behavior derived
        // from BIND 9).  Technically this may not be correct, but in practice
        // the caller wouldn't distinguish this case from the case it has
        // two EOLs, so we simplify the process.
        const int c = getLexerImpl(lexer)->skipComment(
            getLexerImpl(lexer)->source_->getChar());
        if (c != '\n') {
            getLexerImpl(lexer)->source_->ungetChar();
        }
        getLexerImpl(lexer)->token_ = Token(Token::END_OF_LINE);
        getLexerImpl(lexer)->last_was_eol_ = true;
221 222 223 224
        return (NULL);
    }
};

225 226 227
class String : public State {
public:
    String() {}
228
    virtual ~String() {}      // see the base class for the destructor
229
    virtual const State* handle(MasterLexer& lexer) const;
230 231
};

232 233 234 235 236 237 238
// The state that recognizes a quoted string token; handle() is defined
// later in this file.
class QString : public State {
public:
    QString() {}
    virtual ~QString() {}      // see the base class for the destructor
    virtual const State* handle(MasterLexer& lexer) const;
};

239 240 241 242 243
// We use a common instance of a each state in a singleton-like way to save
// construction overhead.  They are not singletons in its strict sense as
// we don't prohibit direct construction of these objects.  But that doesn't
// matter much anyway, because the definitions are completely hidden within
// this file.
JINMEI Tatuya's avatar
JINMEI Tatuya committed
244
const CRLF CRLF_STATE;
245
const String STRING_STATE;
246
const QString QSTRING_STATE;
247 248 249
}

// Map a state ID to the corresponding shared state object.
const State&
State::getInstance(ID state_id) {
    const State* instance = NULL;
    switch (state_id) {
    case CRLF:
        instance = &CRLF_STATE;
        break;
    case String:
        instance = &STRING_STATE;
        break;
    case QString:
        instance = &QSTRING_STATE;
        break;
    }

    if (instance == NULL) {
        // An unknown ID is a bug of the caller, and this method is only
        // expected to be used by tests, so it is forcefully made to fail by
        // asserting the condition.
        assert(false);
        return (STRING_STATE); // a dummy return, to silence some compilers.
    }
    return (*instance);
}

JINMEI Tatuya's avatar
JINMEI Tatuya committed
267
// Begin recognizing the next token.
//
// Characters are consumed from the current source until either a complete
// token has been stored in the lexer (NULL is returned) or a character is
// found that must be handled by another state (a pointer to that state is
// returned).  options selects the optional behaviors: recognizing initial
// white space and recognizing quoted strings.
const State*
State::start(MasterLexer& lexer, MasterLexer::Options options) {
    MasterLexer::MasterLexerImpl& impl = *lexer.impl_;
    const bool want_initial_ws = (options & MasterLexer::INITIAL_WS) != 0;
    const bool want_qstring = (options & MasterLexer::QSTRING) != 0;

    // Note: the chain of checks in the loop is getting complicated.  When
    // #2374 is completed, revisit the organization to see if a fundamental
    // refactoring is needed.
    for (;;) {
        const int ch = impl.skipComment(impl.source_->getChar());

        if (ch == InputSource::END_OF_STREAM) {
            impl.last_was_eol_ = false;
            if (impl.paren_count_ == 0) {
                impl.token_ = Token(Token::END_OF_FILE);
            } else {
                impl.token_ = Token(Token::UNBALANCED_PAREN);
                // reset to 0; this helps in lenient mode.
                impl.paren_count_ = 0;
            }
            return (NULL);
        }

        if (ch == ' ' || ch == '\t') {
            // If requested and we are not in (), recognize the initial space.
            if (impl.last_was_eol_ && impl.paren_count_ == 0 &&
                want_initial_ws) {
                impl.last_was_eol_ = false;
                impl.token_ = Token(Token::INITIAL_WS);
                return (NULL);
            }
            continue;
        }

        if (ch == '\n') {
            impl.last_was_eol_ = true;
            // EOL is not recognized while inside ().
            if (impl.paren_count_ == 0) {
                impl.token_ = Token(Token::END_OF_LINE);
                return (NULL);
            }
            continue;
        }

        if (ch == '\r') {
            // As with '\n', this is only meaningful outside ().
            if (impl.paren_count_ == 0) {
                return (&CRLF_STATE);
            }
            continue;
        }

        if (ch == '"' && want_qstring) {
            impl.last_was_eol_ = false;
            return (&QSTRING_STATE);
        }

        if (ch == '(') {
            impl.last_was_eol_ = false;
            ++impl.paren_count_;
            continue;
        }

        if (ch == ')') {
            impl.last_was_eol_ = false;
            if (impl.paren_count_ == 0) {
                impl.token_ = Token(Token::UNBALANCED_PAREN);
                return (NULL);
            }
            --impl.paren_count_;
            continue;
        }

        // Anything else begins a string; it is pushed back so that the
        // string state sees it as the first character.
        impl.source_->ungetChar();
        impl.last_was_eol_ = false;
        return (&STRING_STATE);
    }
}
327

328 329 330 331 332
// Recognize an unquoted string token.
//
// Characters are collected into the lexer's data buffer until a token
// terminator (as defined by isTokenEnd(), taking backslash escapes into
// account) is found.  The terminator is pushed back to the source, the
// collected region is stored in the lexer as a string token, and NULL is
// returned.
const State*
String::handle(MasterLexer& lexer) const {
    std::vector<char>& data = getLexerImpl(lexer)->data_;
    data.clear();

    bool escaped = false;
    while (true) {
        const int c = getLexerImpl(lexer)->skipComment(
            getLexerImpl(lexer)->source_->getChar(), escaped);

        if (getLexerImpl(lexer)->isTokenEnd(c, escaped)) {
            getLexerImpl(lexer)->source_->ungetChar();
            // Append a terminating NUL that is excluded from the token
            // length.  This keeps the buffer non-empty, so taking the
            // address of its first element is valid even when no character
            // was collected, and lets the region be read as a C string.
            data.push_back('\0');
            getLexerImpl(lexer)->token_ =
                MasterLexer::Token(&data[0], data.size() - 1);
            return (NULL);
        }
        // A backslash escapes the next character unless it is itself
        // escaped.
        escaped = (c == '\\' && !escaped);
        data.push_back(static_cast<char>(c));
    }
}

349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
// Recognize a quoted string token; the opening '"' has already been
// consumed.
//
// Characters up to the closing unescaped '"' are collected into the lexer's
// data buffer and stored in the lexer as a quoted string token.  An escaped
// '"' is stored as a bare '"' (its backslash is dropped).  Reaching the end
// of the stream yields an UNEXPECTED_END error token; an unescaped newline
// yields an UNBALANCED_QUOTES error token and the newline is pushed back to
// the source.  Always returns NULL.
const State*
QString::handle(MasterLexer& lexer) const {
    MasterLexer::Token& token = getLexerImpl(lexer)->token_;
    std::vector<char>& data = getLexerImpl(lexer)->data_;
    data.clear();

    bool escaped = false;
    while (true) {
        const int c = getLexerImpl(lexer)->source_->getChar();
        if (c == InputSource::END_OF_STREAM) {
            token = Token(Token::UNEXPECTED_END);
            return (NULL);
        } else if (c == '"') {
            if (escaped) {
                // found escaped '"'. overwrite the preceding backslash.
                assert(!data.empty());
                escaped = false;
                data.back() = '"';
            } else {
                // Append a terminating NUL that is excluded from the token
                // length.  This keeps the buffer non-empty, so an empty
                // quoted string ("") is accepted instead of failing on
                // access to the first element, and lets the region be read
                // as a C string.
                data.push_back('\0');
                token = MasterLexer::Token(&data[0], data.size() - 1, true);
                return (NULL);
            }
        } else if (c == '\n' && !escaped) {
            getLexerImpl(lexer)->source_->ungetChar();
            token = Token(Token::UNBALANCED_QUOTES);
            return (NULL);
        } else {
            // A backslash escapes the next character unless it is itself
            // escaped.
            escaped = (c == '\\' && !escaped);
            data.push_back(static_cast<char>(c));
        }
    }
}

382 383
} // namespace master_lexer_internal

384 385
} // end of namespace dns
} // end of namespace isc