// Copyright (C) 2012  Internet Systems Consortium, Inc. ("ISC")
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
// REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
// AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
// LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THIS SOFTWARE.

#ifndef MASTER_LEXER_H
#define MASTER_LEXER_H 1

#include <exceptions/exceptions.h>

20
#include <istream>
21
22
23
24
25
26
27
#include <string>

#include <stdint.h>

namespace isc {
namespace dns {

JINMEI Tatuya's avatar
JINMEI Tatuya committed
28
29
30
31
/// \brief Tokenizer for parsing DNS master files.
///
/// The \c MasterLexer class provides tokenize interfaces for parsing DNS
/// master files.  It understands some special rules of master files as
/// defined in RFC 1035, such as comments, character escaping, and multi-line
/// data, and provides the user application with the actual data in a
/// more convenient form such as a std::string object.
///
/// In order to support the $INCLUDE notation, this class is designed to be
/// able to operate on multiple files or input streams in the nested way.
/// The \c open() and \c close() methods correspond to the push and pop
/// operations.
///
/// While this class is public, it is less likely to be used by normal
/// applications; it's mainly expected to be used within this library,
/// specifically by the \c MasterLoader class and \c Rdata implementation
/// classes.
///
/// Objects of this class cannot be copied or assigned: the object holds
/// a pointer to its internal state, and a member-wise copy would leave two
/// objects sharing that single pointer.
class MasterLexer {
public:
    class Token;       // we define it separately for better readability

    /// \brief The constructor.
    ///
    /// \throw std::bad_alloc Internal resource allocation fails (rare case).
    MasterLexer();

    /// \brief The destructor.
    ///
    /// It internally closes any remaining input sources.
    ~MasterLexer();

    /// \brief Open a file and make it the current input source of MasterLexer.
    ///
    /// The opened file can be explicitly closed by the \c close() method;
    /// if \c close() is not called within the lifetime of the \c MasterLexer,
    /// it will be closed in the destructor.
    ///
    /// \throw InvalidParameter filename is NULL
    /// \throw some_other The specified file cannot be opened
    /// \param filename A non NULL string specifying a master file
    void open(const char* filename);

    /// \brief Make the given stream the current input source of MasterLexer.
    ///
    /// The caller still holds the ownership of the passed stream; it's the
    /// caller's responsibility to keep it valid as long as it's used in
    /// \c MasterLexer or to release any resource for the stream after that.
    /// The caller can explicitly tell \c MasterLexer to stop using the
    /// stream by calling the \c close() method.
    ///
    /// \param input An input stream object that produces textual
    /// representation of DNS RRs.
    void open(std::istream& input);

    /// \brief Close the most recently opened input source (file or stream).
    ///
    /// If it's a file, the opened file will be literally closed.
    /// If it's a stream, \c MasterLexer will simply stop using
    /// the stream; the caller can assume it will be never used in
    /// \c MasterLexer thereafter.
    ///
    /// This method must not be called when there is no opened source for
    /// \c MasterLexer.  This method is otherwise exception free.
    ///
    /// \throw isc::InvalidOperation Called with no opened source.
    void close();

    /// \brief Return the name of the current input source.
    ///
    /// If it's a file, it will be the C string given at the corresponding
    /// \c open() call, that is, its filename.  If it's a stream, it will
    /// be formatted as \c "stream-%p" where \c %p is hex representation
    /// of the address of the stream object.
    ///
    /// If there is no opened source at the time of the call, this method
    /// returns an empty string.
    ///
    /// \throw std::bad_alloc Resource allocation failed for string
    /// construction (rare case)
    ///
    /// \return A string representation of the current source (see the
    /// description)
    std::string getSourceName() const;

    /// \brief Return the input source line number.
    ///
    /// If there is an opened source, the return value will be a non-0
    /// integer indicating the line number of the current source where
    /// the \c MasterLexer is currently working.  The expected usage of
    /// this value is to print a helpful error message when parsing fails
    /// by specifically identifying the position of the error.
    ///
    /// If there is no opened source at the time of the call, this method
    /// returns 0.
    ///
    /// \throw None
    ///
    /// \return The current line number of the current source, or 0 if
    /// there is no opened source (see the description)
    size_t getSourceLine() const;

private:
    // Copying and assignment are prohibited.  The implicitly generated
    // versions would duplicate the impl_ pointer, so two lexers would end up
    // referring to (and eventually cleaning up) the same internal state.
    // These are intentionally declared private and left undefined.
    MasterLexer(const MasterLexer& source);
    MasterLexer& operator=(const MasterLexer& source);

    struct MasterLexerImpl;
    MasterLexerImpl* impl_;     // opaque internal state (pimpl)
};
133

134
135
136
137
138
139
140
141
142
143
144
145
146
147
/// \brief Tokens for \c MasterLexer
///
/// This is a simple value-class encapsulating a type of a lexer token and
/// (if it has a value) its value.  Essentially, the class provides
/// constructors corresponding to different types of tokens, and corresponding
/// getter methods.  The type and value are fixed at the time of construction
/// and will never be modified throughout the lifetime of the object.
/// The getter methods are still provided to maximize the safety; an
/// application cannot refer to a value that is invalid for the type of token.
///
/// This class is intentionally implemented as copyable and assignable
/// (using the default version of copy constructor and assignment operator),
/// but it's mainly for internal implementation convenience.  Applications will
/// simply refer to Token object as a reference via the \c MasterLexer class.
148
149
class MasterLexer::Token {
public:
150
    /// \brief Enumeration for token types
151
152
153
154
155
    ///
    /// \note At the time of initial implementation, all numeric tokens
    /// that would be extracted from \c MasterLexer should be represented
    /// as an unsigned 32-bit integer.  If we see the need for larger integers
    /// or negative numbers, we can then extend the token types.
156
    enum Type {
157
158
159
160
161
162
163
164
165
166
        END_OF_LINE, ///< End of line detected (if asked for detecting it)
        END_OF_FILE, ///< End of file detected (if asked for detecting it)
        INITIAL_WS,  ///< White spaces at the beginning of a line
        NOVALUE_TYPE_MAX = INITIAL_WS, ///< Max integer corresponding to
                                       /// no-value (type only) types.
                                       /// Mainly for internal use.
        STRING, ///< A single string
        QSTRING, ///< A single string quoted by double-quotes (").
        NUMBER,  ///< A decimal number (unsigned 32-bit)
        ERROR    ///< Error detected in getting a token
167
    };
168

169
    /// \brief Enumeration for lexer error codes
170
    enum ErrorCode {
171
172
173
174
175
176
177
        NOT_STARTED, ///< The lexer is just initialized and has no token
        UNBALANCED_PAREN,       ///< Unbalanced parentheses detected
        UNEXPECTED_END, ///< The lexer reaches the end of line or file
                       /// unexpectedly
        UNBALANCED_QUOTES,      ///< Unbalanced quotations detected
        MAX_ERROR_CODE ///< Max integer corresponding to valid error codes.
                       /// (excluding this one). Mainly for internal use.
178
179
    };

180
181
182
183
184
185
186
187
188
189
190
    /// \brief A simple representation of a range of a string.
    ///
    /// This is a straightforward pair of the start pointer of a string
    /// and its length.  The \c STRING and \c QSTRING types of tokens
    /// will be primarily represented in this form.
    ///
    /// Any character can be stored in the valid range of the region.
    /// In particular, there can be a nul character (\0) in the middle of
    /// the region.  On the other hand, it is not ensured that the string
    /// is nul-terminated.  So the usual string manipulation API may not work
    /// as expected.
191
    struct StringRegion {
192
193
        const char* beg;        ///< The start address of the string
        size_t len;             ///< The length of the string in bytes
194
195
    };

196
197
198
199
200
    /// \brief Constructor for non-value type of token.
    ///
    /// \throw InvalidParameter A value type token is specified.
    /// \param type The type of the token.  It must indicate a non-value
    /// type (not larger than \c NOVALUE_TYPE_MAX).
201
    explicit Token(Type type) : type_(type) {
202
        if (type > NOVALUE_TYPE_MAX) {
203
204
            isc_throw(InvalidParameter, "Token per-type constructor "
                      "called with invalid type: " << type);
205
        }
206
    }
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222

    /// \brief Constructor for string and quoted-string types of token.
    ///
    /// The optional \c quoted parameter specifies whether it's a quoted or
    /// non quoted string.
    ///
    /// The string is specified as a pair of a pointer to the start address
    /// and its length.  Any character can be contained in any position of
    /// the valid range (see \c StringRegion).
    ///
    /// When it's a quoted string, the quotation marks must be excluded
    /// from the specified range.
    ///
    /// \param str_beg The start address of the string
    /// \param str_len The size of the string in bytes
    /// \param quoted true if it's a quoted string; false otherwise.
223
224
225
226
227
228
    Token(const char* str_beg, size_t str_len, bool quoted = false) :
        type_(quoted ? QSTRING : STRING)
    {
        val_.str_region_.beg = str_beg;
        val_.str_region_.len = str_len;
    }
229
230
231
232
233

    /// \brief Constructor for number type of token.
    ///
    /// \brief number An unsigned 32-bit integer corresponding to the token
    /// value.
234
235
236
    explicit Token(uint32_t number) : type_(NUMBER) {
        val_.number_ = number;
    }
237
238
239
240
241

    /// \brief Constructor for error type of token.
    ///
    /// \throw InvalidParameter Invalid error code value is specified.
    /// \brief error_code A pre-defined constant of \c ErrorCode.
242
    explicit Token(ErrorCode error_code) : type_(ERROR) {
243
        if (!(error_code < MAX_ERROR_CODE)) {
244
245
246
247
248
            isc_throw(InvalidParameter, "Invalid master lexer error code: "
                      << error_code);
        }
        val_.error_code_ = error_code;
    }
249

250
251
252
    /// \brief Return the token type.
    ///
    /// \throw none
253
    Type getType() const { return (type_); }
254
255
256
257
258
259
260

    /// \brief Return the value of a string-variant token.
    ///
    /// \throw InvalidOperation Called on a non string-variant types of token.
    /// \return A reference to \c StringRegion corresponding to the string
    ///         token value.
    const StringRegion& getStringRegion() const {
261
262
        if (type_ != STRING && type_ != QSTRING) {
            isc_throw(InvalidOperation,
263
                      "Token::getStringRegion() for non string-variant type");
264
        }
265
        return (val_.str_region_);
266
    }
267
268
269
270
271
272
273
274
275
276
277
278
279

    /// \brief Return the value of a string-variant token as a string object.
    ///
    /// Note that the underlying string may contain a nul (\0) character
    /// in the middle.  The returned string object will contain all characters
    /// of the valid range of the underlying string.  So some string
    /// operations such as c_str() may not work as expected.
    ///
    /// \throw InvalidOperation Called on a non string-variant types of token.
    /// \throw std::bad_alloc Resource allocation failure in constructing the
    ///                       string object.
    /// \return A std::string object corresponding to the string token value.
    std::string getString() const {
280
281
        if (type_ != STRING && type_ != QSTRING) {
            isc_throw(InvalidOperation,
282
                      "Token::getString() for non string-variant type");
283
        }
284
285
        return (std::string(val_.str_region_.beg,
                            val_.str_region_.beg + val_.str_region_.len));
286
    }
287
288
289
290
291

    /// \brief Return the value of a string-variant token as a string object.
    ///
    /// \throw InvalidOperation Called on a non number type of token.
    /// \return The integer corresponding to the number token value.
292
293
294
295
    uint32_t getNumber() const {
        if (type_ != NUMBER) {
            isc_throw(InvalidOperation,
                      "Token::getNumber() for non number type");
296
        }
297
298
        return (val_.number_);
    }
299
300
301
302
303

    /// \brief Return the error code of a error type token.
    ///
    /// \throw InvalidOperation Called on a non error type of token.
    /// \return The error code of the token.
304
305
306
307
308
309
310
    ErrorCode getErrorCode() const {
        if (type_ != ERROR) {
            isc_throw(InvalidOperation,
                      "Token::getErrorCode() for non error type");
        }
        return (val_.error_code_);
    };
311
312
313
314
315
316
317
318
319
320

    /// \brief Return a textual description of the error of a error type token.
    ///
    /// The returned string would be useful to produce a log message when
    /// a zone file parser encounters an error.
    ///
    /// \throw InvalidOperation Called on a non error type of token.
    /// \throw std::bad_alloc Resource allocation failure in constructing the
    ///                       string object.
    /// \return A string object that describes the meaning of the error.
321
    std::string getErrorText() const;
322

323
private:
324
325
326
327
328
329
    Type type_;    // this is not const so the class can be assignable

    // We use a union to represent different types of token values via the
    // unified Token class.  The class integrity should ensure valid operation
    // on the union; getter methods should only refer to the member set at
    // the construction.
330
331
332
    union {
        StringRegion str_region_;
        uint32_t number_;
333
        ErrorCode error_code_;
334
    } val_;
335
336
337
338
339
340
341
342
343
};

} // namespace dns
} // namespace isc
#endif  // MASTER_LEXER_H

// Local Variables:
// mode: c++
// End: