// Copyright (C) 2012  Internet Systems Consortium, Inc. ("ISC")
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
// AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THIS SOFTWARE.

#ifndef MASTER_LEXER_H
#define MASTER_LEXER_H 1

#include <exceptions/exceptions.h>

#include <istream>
#include <string>

#include <stdint.h>

namespace isc {
namespace dns {
// Forward declaration only: MasterLexer names this class as a friend, so
// the type has to be known before the MasterLexer definition below.
namespace master_lexer_internal { class State; } // namespace master_lexer_internal
30

JINMEI Tatuya's avatar
JINMEI Tatuya committed
31
32
33
34
/// \brief Tokenizer for parsing DNS master files.
///
/// The \c MasterLexer class provides tokenize interfaces for parsing DNS
/// master files.  It understands some special rules of master files as
35
/// defined in RFC 1035, such as comments, character escaping, and multi-line
JINMEI Tatuya's avatar
JINMEI Tatuya committed
36
37
38
39
40
/// data, and provides the user application with the actual data in a
/// more convenient form such as a std::string object.
///
/// In order to support the $INCLUDE notation, this class is designed to be
/// able to operate on multiple files or input streams in the nested way.
41
42
/// The \c pushSource() and \c popSource() methods correspond to the push
/// and pop operations.
JINMEI Tatuya's avatar
JINMEI Tatuya committed
43
44
45
46
47
///
/// While this class is public, it is less likely to be used by normal
/// applications; it's mainly expected to be used within this library,
/// specifically by the \c MasterLoader class and \c Rdata implementation
/// classes.
48
class MasterLexer {
49
    friend class master_lexer_internal::State;
50
public:
JINMEI Tatuya's avatar
JINMEI Tatuya committed
51
    class Token;       // we define it separately for better readability
52

53
54
55
56
    /// \brief Options for getNextToken.
    ///
    /// A compound option, indicating multiple options are set, can be
    /// specified using the logical OR operator (operator|()).
JINMEI Tatuya's avatar
JINMEI Tatuya committed
57
    enum Options {
58
59
60
        NONE = 0,               ///< No option
        INITIAL_WS = 1, ///< recognize begin-of-line spaces after an
                        ///< end-of-line
61
62
        QSTRING = 2,    ///< recognize quoted string
        NUMBER = 4   ///< recognize numeric text as integer
JINMEI Tatuya's avatar
JINMEI Tatuya committed
63
64
    };

JINMEI Tatuya's avatar
JINMEI Tatuya committed
65
66
67
    /// \brief The constructor.
    ///
    /// \throw std::bad_alloc Internal resource allocation fails (rare case).
68
    MasterLexer();
JINMEI Tatuya's avatar
JINMEI Tatuya committed
69
70
71
72

    /// \brief The destructor.
    ///
    /// It internally closes any remaining input sources.
73
    ~MasterLexer();
JINMEI Tatuya's avatar
JINMEI Tatuya committed
74
75
76

    /// \brief Open a file and make it the current input source of MasterLexer.
    ///
77
78
79
    /// The opened file can be explicitly closed by the \c popSource() method;
    /// if \c popSource() is not called within the lifetime of the
    /// \c MasterLexer, it will be closed in the destructor.
JINMEI Tatuya's avatar
JINMEI Tatuya committed
80
    ///
81
82
83
84
85
86
87
    /// In the case possible system errors in opening the file (most likely
    /// because of specifying a non-existent or unreadable file), it returns
    /// false, and if the optional \c error parameter is non NULL, it will be
    /// set to a description of the error (any existing content of the string
    /// will be discarded).  If opening the file succeeds, the given
    /// \c error parameter will be intact.
    ///
JINMEI Tatuya's avatar
JINMEI Tatuya committed
88
89
    /// \throw InvalidParameter filename is NULL
    /// \param filename A non NULL string specifying a master file
90
91
92
93
94
    /// \param error If non null, a placeholder to set error description in
    /// case of failure.
    ///
    /// \return true if pushing the file succeeds; false otherwise.
    bool pushSource(const char* filename, std::string* error = NULL);
JINMEI Tatuya's avatar
JINMEI Tatuya committed
95
96
97
98
99
100
101

    /// \brief Make the given stream the current input source of MasterLexer.
    ///
    /// The caller still holds the ownership of the passed stream; it's the
    /// caller's responsibility to keep it valid as long as it's used in
    /// \c MasterLexer or to release any resource for the stream after that.
    /// The caller can explicitly tell \c MasterLexer to stop using the
102
    /// stream by calling the \c popSource() method.
JINMEI Tatuya's avatar
JINMEI Tatuya committed
103
104
105
    ///
    /// \param input An input stream object that produces textual
    /// representation of DNS RRs.
106
    void pushSource(std::istream& input);
JINMEI Tatuya's avatar
JINMEI Tatuya committed
107

108
109
    /// \brief Stop using the most recently opened input source (file or
    /// stream).
JINMEI Tatuya's avatar
JINMEI Tatuya committed
110
    ///
111
    /// If it's a file, the previously opened file will be closed internally.
112
    /// If it's a stream, \c MasterLexer will simply stop using
JINMEI Tatuya's avatar
JINMEI Tatuya committed
113
114
115
    /// the stream; the caller can assume it will be never used in
    /// \c MasterLexer thereafter.
    ///
116
    /// This method must not be called when there is no source pushed for
JINMEI Tatuya's avatar
JINMEI Tatuya committed
117
118
    /// \c MasterLexer.  This method is otherwise exception free.
    ///
119
120
    /// \throw isc::InvalidOperation Called with no pushed source.
    void popSource();
JINMEI Tatuya's avatar
JINMEI Tatuya committed
121

122
    /// \brief Return the name of the current input source name.
JINMEI Tatuya's avatar
JINMEI Tatuya committed
123
124
    ///
    /// If it's a file, it will be the C string given at the corresponding
125
    /// \c pushSource() call, that is, its filename.  If it's a stream, it will
126
127
    /// be formatted as \c "stream-%p" where \c %p is hex representation
    /// of the address of the stream object.
JINMEI Tatuya's avatar
JINMEI Tatuya committed
128
129
130
131
132
133
134
135
136
    ///
    /// If there is no opened source at the time of the call, this method
    /// returns an empty string.
    ///
    /// \throw std::bad_alloc Resource allocation failed for string
    /// construction (rare case)
    ///
    /// \return A string representation of the current source (see the
    /// description)
137
    std::string getSourceName() const;
JINMEI Tatuya's avatar
JINMEI Tatuya committed
138
139
140
141
142
143
144
145
146
147
148
149
150
151

    /// \brief Return the input source line number.
    ///
    /// If there is an opened source, the return value will be a non-0
    /// integer indicating the line number of the current source where
    /// the \c MasterLexer is currently working.  The expected usage of
    /// this value is to print a helpful error message when parsing fails
    /// by specifically identifying the position of the error.
    ///
    /// If there is no opened source at the time of the call, this method
    /// returns 0.
    ///
    /// \throw None
    ///
152
    /// \return The current line number of the source (see the description)
153
154
155
156
157
    size_t getSourceLine() const;

private:
    struct MasterLexerImpl;
    MasterLexerImpl* impl_;
158
};
159

160
161
162
163
164
165
166
167
168
169
/// \brief Bitwise-OR combinator for \c MasterLexer::Options values.
///
/// A small convenience so that a compound option can be spelled in the
/// natural way, i.e. as the OR of the individual option constants.
///
/// \param o1 The left-hand option (or already combined options).
/// \param o2 The right-hand option (or already combined options).
/// \return An \c Options value holding the bits of both operands.
inline MasterLexer::Options
operator|(MasterLexer::Options o1, MasterLexer::Options o2) {
    // Do the arithmetic on plain unsigned values, then convert back.
    const unsigned lhs_bits = static_cast<unsigned>(o1);
    const unsigned rhs_bits = static_cast<unsigned>(o2);
    return (static_cast<MasterLexer::Options>(lhs_bits | rhs_bits));
}

170
171
172
173
174
175
176
177
178
179
180
181
182
183
/// \brief Tokens for \c MasterLexer
///
/// This is a simple value-class encapsulating a type of a lexer token and
/// (if it has a value) its value.  Essentially, the class provides
/// constructors corresponding to different types of tokens, and corresponding
/// getter methods.  The type and value are fixed at the time of construction
/// and will never be modified throughout the lifetime of the object.
/// The getter methods are still provided to maximize the safety; an
/// application cannot refer to a value that is invalid for the type of token.
///
/// This class is intentionally implemented as copyable and assignable
/// (using the default version of copy constructor and assignment operator),
/// but it's mainly for internal implementation convenience.  Applications will
/// simply refer to Token object as a reference via the \c MasterLexer class.
184
185
class MasterLexer::Token {
public:
186
    /// \brief Enumeration for token types
187
188
189
190
191
    ///
    /// \note At the time of initial implementation, all numeric tokens
    /// that would be extracted from \c MasterLexer should be represented
    /// as an unsigned 32-bit integer.  If we see the need for larger integers
    /// or negative numbers, we can then extend the token types.
192
    enum Type {
193
194
        END_OF_LINE, ///< End of line detected (if asked for detecting it)
        END_OF_FILE, ///< End of file detected (if asked for detecting it)
195
196
        INITIAL_WS,  ///< White spaces at the beginning of a line after an
                     ///< end of line
197
198
199
200
201
202
203
        NOVALUE_TYPE_MAX = INITIAL_WS, ///< Max integer corresponding to
                                       /// no-value (type only) types.
                                       /// Mainly for internal use.
        STRING, ///< A single string
        QSTRING, ///< A single string quoted by double-quotes (").
        NUMBER,  ///< A decimal number (unsigned 32-bit)
        ERROR    ///< Error detected in getting a token
204
    };
205

206
    /// \brief Enumeration for lexer error codes
207
    enum ErrorCode {
208
209
210
211
212
213
214
        NOT_STARTED, ///< The lexer is just initialized and has no token
        UNBALANCED_PAREN,       ///< Unbalanced parentheses detected
        UNEXPECTED_END, ///< The lexer reaches the end of line or file
                       /// unexpectedly
        UNBALANCED_QUOTES,      ///< Unbalanced quotations detected
        MAX_ERROR_CODE ///< Max integer corresponding to valid error codes.
                       /// (excluding this one). Mainly for internal use.
215
216
    };

217
218
219
220
221
222
223
224
225
226
227
    /// \brief A simple representation of a range of a string.
    ///
    /// This is a straightforward pair of the start pointer of a string
    /// and its length.  The \c STRING and \c QSTRING types of tokens
    /// will be primarily represented in this form.
    ///
    /// Any character can be stored in the valid range of the region.
    /// In particular, there can be a nul character (\0) in the middle of
    /// the region.  On the other hand, it is not ensured that the string
    /// is nul-terminated.  So the usual string manipulation API may not work
    /// as expected.
228
    struct StringRegion {
229
230
        const char* beg;        ///< The start address of the string
        size_t len;             ///< The length of the string in bytes
231
232
    };

233
234
235
236
237
    /// \brief Constructor for non-value type of token.
    ///
    /// \throw InvalidParameter A value type token is specified.
    /// \param type The type of the token.  It must indicate a non-value
    /// type (not larger than \c NOVALUE_TYPE_MAX).
238
    explicit Token(Type type) : type_(type) {
239
        if (type > NOVALUE_TYPE_MAX) {
240
241
            isc_throw(InvalidParameter, "Token per-type constructor "
                      "called with invalid type: " << type);
242
        }
243
    }
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259

    /// \brief Constructor for string and quoted-string types of token.
    ///
    /// The optional \c quoted parameter specifies whether it's a quoted or
    /// non quoted string.
    ///
    /// The string is specified as a pair of a pointer to the start address
    /// and its length.  Any character can be contained in any position of
    /// the valid range (see \c StringRegion).
    ///
    /// When it's a quoted string, the quotation marks must be excluded
    /// from the specified range.
    ///
    /// \param str_beg The start address of the string
    /// \param str_len The size of the string in bytes
    /// \param quoted true if it's a quoted string; false otherwise.
260
261
262
263
264
265
    Token(const char* str_beg, size_t str_len, bool quoted = false) :
        type_(quoted ? QSTRING : STRING)
    {
        val_.str_region_.beg = str_beg;
        val_.str_region_.len = str_len;
    }
266
267
268
269
270

    /// \brief Constructor for number type of token.
    ///
    /// \brief number An unsigned 32-bit integer corresponding to the token
    /// value.
271
272
273
    explicit Token(uint32_t number) : type_(NUMBER) {
        val_.number_ = number;
    }
274
275
276
277
278

    /// \brief Constructor for error type of token.
    ///
    /// \throw InvalidParameter Invalid error code value is specified.
    /// \brief error_code A pre-defined constant of \c ErrorCode.
279
    explicit Token(ErrorCode error_code) : type_(ERROR) {
280
        if (!(error_code < MAX_ERROR_CODE)) {
281
282
283
284
285
            isc_throw(InvalidParameter, "Invalid master lexer error code: "
                      << error_code);
        }
        val_.error_code_ = error_code;
    }
286

287
288
289
    /// \brief Return the token type.
    ///
    /// \throw none
290
    Type getType() const { return (type_); }
291
292
293
294
295
296
297

    /// \brief Return the value of a string-variant token.
    ///
    /// \throw InvalidOperation Called on a non string-variant types of token.
    /// \return A reference to \c StringRegion corresponding to the string
    ///         token value.
    const StringRegion& getStringRegion() const {
298
299
        if (type_ != STRING && type_ != QSTRING) {
            isc_throw(InvalidOperation,
300
                      "Token::getStringRegion() for non string-variant type");
301
        }
302
        return (val_.str_region_);
303
    }
304
305
306
307
308
309
310
311
312
313
314
315
316

    /// \brief Return the value of a string-variant token as a string object.
    ///
    /// Note that the underlying string may contain a nul (\0) character
    /// in the middle.  The returned string object will contain all characters
    /// of the valid range of the underlying string.  So some string
    /// operations such as c_str() may not work as expected.
    ///
    /// \throw InvalidOperation Called on a non string-variant types of token.
    /// \throw std::bad_alloc Resource allocation failure in constructing the
    ///                       string object.
    /// \return A std::string object corresponding to the string token value.
    std::string getString() const {
317
318
        if (type_ != STRING && type_ != QSTRING) {
            isc_throw(InvalidOperation,
319
                      "Token::getString() for non string-variant type");
320
        }
321
322
        return (std::string(val_.str_region_.beg,
                            val_.str_region_.beg + val_.str_region_.len));
323
    }
324
325
326
327
328

    /// \brief Return the value of a string-variant token as a string object.
    ///
    /// \throw InvalidOperation Called on a non number type of token.
    /// \return The integer corresponding to the number token value.
329
330
331
332
    uint32_t getNumber() const {
        if (type_ != NUMBER) {
            isc_throw(InvalidOperation,
                      "Token::getNumber() for non number type");
333
        }
334
335
        return (val_.number_);
    }
336
337
338
339
340

    /// \brief Return the error code of a error type token.
    ///
    /// \throw InvalidOperation Called on a non error type of token.
    /// \return The error code of the token.
341
342
343
344
345
346
347
    ErrorCode getErrorCode() const {
        if (type_ != ERROR) {
            isc_throw(InvalidOperation,
                      "Token::getErrorCode() for non error type");
        }
        return (val_.error_code_);
    };
348
349
350
351
352
353
354
355
356
357

    /// \brief Return a textual description of the error of a error type token.
    ///
    /// The returned string would be useful to produce a log message when
    /// a zone file parser encounters an error.
    ///
    /// \throw InvalidOperation Called on a non error type of token.
    /// \throw std::bad_alloc Resource allocation failure in constructing the
    ///                       string object.
    /// \return A string object that describes the meaning of the error.
358
    std::string getErrorText() const;
359

360
private:
361
362
363
364
365
366
    Type type_;    // this is not const so the class can be assignable

    // We use a union to represent different types of token values via the
    // unified Token class.  The class integrity should ensure valid operation
    // on the union; getter methods should only refer to the member set at
    // the construction.
367
368
369
    union {
        StringRegion str_region_;
        uint32_t number_;
370
        ErrorCode error_code_;
371
    } val_;
372
373
374
375
376
377
378
379
380
};

} // namespace dns
} // namespace isc
#endif  // MASTER_LEXER_H

// Local Variables:
// mode: c++
// End: