Commit 39ab2f5d authored by Thomas Markwalder's avatar Thomas Markwalder Committed by Tomek Mrugalski

[5680] Added sanitizeString() function to libutil

configure.ac
    Added logic to detect usable C++11 regex

src/lib/util/strutil.*
    sanitizeString() - new function that replaces all
    occurrences of invalid chars in a string with a
    specified replacement

src/lib/util/tests/strutil_unittest.cc
    TEST(StringUtilTest, sanitizeString) - new test
parent 903e9f39
......@@ -571,6 +571,20 @@ AC_TRY_COMPILE([
AC_DEFINE(HAVE_SA_LEN, 1, [Define to 1 if sockaddr has a sa_len member, and corresponding sin_len and sun_len])],
AC_MSG_RESULT(no))
# Check whether the C++11 <regex> implementation actually works. Early
# implementations compile but misbehave at run time, so the probe has to be
# executed rather than merely compiled. USE_REGEX is defined only when the
# probe runs and reports a successful match.
AC_MSG_CHECKING(for usable C++11 regex)
AC_TRY_RUN([
#include <cstdlib>
#include <iostream>
#include <regex>
#include <string>
int main() {
    const std::regex regex(".*");
    const std::string string = "This should match!";
    const auto result = std::regex_search(string, regex);
    return result ? EXIT_SUCCESS : EXIT_FAILURE;
}],
    [AC_MSG_RESULT(yes)
     AC_DEFINE(USE_REGEX, 1, [Define to 1 if C++11 regex is usable])],
    [AC_MSG_RESULT(no)],
    [# The probe cannot be executed when cross compiling, so stay on the
     # conservative side and leave USE_REGEX undefined.
     AC_MSG_RESULT([no (cross compiling)])])
enable_gtest="no"
GTEST_INCLUDES=
......
// Copyright (C) 2011-2017 Internet Systems Consortium, Inc. ("ISC")
// Copyright (C) 2011-2018 Internet Systems Consortium, Inc. ("ISC")
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
......@@ -14,9 +14,20 @@
#include <boost/algorithm/string/split.hpp>
#include <numeric>
#include <iostream>
#include <sstream>
#include <string.h>
// Early versions of C++11 regex were buggy. Use it if we can;
// otherwise, fall back to regcomp/regexec. For more info see:
// https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions
#ifdef USE_REGEX
#include <regex>
#else
#include <sys/types.h>
#include <regex.h>
#endif
#include <string.h>
using namespace std;
......@@ -288,6 +299,81 @@ decodeFormattedHexString(const std::string& hex_string,
}
}
// Returns a copy of 'original' in which every match of the POSIX extended
// regular expression 'invalidChars' has been replaced with 'replacement'.
//
// Two implementations are provided. When USE_REGEX is defined the C++11
// <regex> library is used; otherwise the function falls back to POSIX
// regcomp()/regexec().
//
// Throws isc::BadValue if 'invalidChars' is not a valid expression (and, in
// the C++11 variant, if the replacement step fails). The POSIX variant throws
// isc::Unexpected if regexec() reports an error or an inconsistent match.
std::string
sanitizeString(const std::string& original,
               const std::string& invalidChars,
               const std::string& replacement) {
#ifdef USE_REGEX
    // Compile the expression.
    std::regex rexpr;
    try {
        rexpr = std::regex(invalidChars, std::regex::extended);
    } catch (const std::exception& ex) {
        isc_throw(isc::BadValue, "invalid regex: '"
                  << invalidChars << "', " << ex.what());
    }

    // Replace every match, streaming the output into 'result'.
    // NOTE(review): std::regex_replace applies its default format rules to
    // 'replacement' (e.g. "$&" expands to the matched text), whereas the
    // POSIX variant below inserts 'replacement' verbatim.
    std::stringstream result;
    try {
        std::regex_replace(std::ostream_iterator<char>(result),
                           original.begin(), original.end(),
                           rexpr, replacement);
    } catch (const std::exception& ex) {
        isc_throw(isc::BadValue, "replacing '" << invalidChars << "' with '"
                  << replacement << "' in '" << original << "' failed: "
                  << ex.what());
    }

    return (result.str());
#else
    // Compile the expression.
    regex_t rex;
    const int ec = regcomp(&rex, invalidChars.c_str(), REG_EXTENDED);
    if (ec) {
        char errbuf[512] = "";
        static_cast<void>(regerror(ec, &rex, errbuf, sizeof(errbuf)));
        isc_throw(isc::BadValue, "invalid regex: '" << invalidChars
                  << "', " << errbuf);
    }

    // From here on the compiled expression must be released on every exit
    // path, including the ones that throw.
    struct RegexGuard {
        explicit RegexGuard(regex_t* compiled) : compiled_(compiled) {}
        ~RegexGuard() { regfree(compiled_); }
        regex_t* compiled_;
    } guard(&rex);

    // regexec() works on NUL-terminated strings, so only the portion of the
    // original up to the first NUL is examined.
    const char* cursor = original.c_str();
    const char* const end = cursor + strlen(cursor);

    std::stringstream result;
    regmatch_t match;
    int eflags = 0;

    // Iterate over original string, match by match.
    while (cursor < end) {
        const int rc = regexec(&rex, cursor, 1, &match, eflags);
        if (rc == REG_NOMATCH) {
            // No more matches, so add in the remainder.
            result.write(cursor, end - cursor);
            break;
        }

        if (rc != 0) {
            // Anything other than success or REG_NOMATCH is a real failure;
            // 'match' holds no usable data in that case.
            char errbuf[512] = "";
            static_cast<void>(regerror(rc, &rex, errbuf, sizeof(errbuf)));
            isc_throw(isc::Unexpected, "executing regex: '" << invalidChars
                      << "' failed: " << errbuf);
        }

        // Shouldn't happen, but one never knows eh?
        if (match.rm_so == -1 || match.rm_eo < match.rm_so) {
            isc_throw(isc::Unexpected, "matched but so is -1?");
        }

        // Add everything from the starting point up to the current match,
        // followed by the replacement.
        result.write(cursor, match.rm_so);
        result << replacement;

        if (match.rm_eo > match.rm_so) {
            // Move past the whole match, however long it is.
            cursor += match.rm_eo;
        } else {
            // Zero-length match: copy the next character through so that
            // the scan always advances and no input is lost.
            cursor += match.rm_so;
            if (cursor < end) {
                result << *cursor;
                ++cursor;
            }
        }

        // Subsequent searches no longer start at the beginning of the
        // string, so '^' must not match at the restart position.
        eflags = REG_NOTBOL;
    }

    return (result.str());
#endif
}
} // namespace str
} // namespace util
} // namespace isc
......@@ -255,6 +255,25 @@ void
decodeFormattedHexString(const std::string& hex_string,
std::vector<uint8_t>& binary);
/// \brief Replaces all occurrences of a character set in a string
///
/// This function runs a given string through a regular expression,
/// replacing all "matches" of that expression with the specified string.
///
/// \param original the string to sanitize
/// \param invalidChars string containing a regular expression (POSIX
/// extended syntax) that describes the characters to replace. If you
/// wanted to sanitize hostnames for example, you could specify the
/// inversion of valid characters "[^A-Za-z0-9_-]".
/// \param replacement string to use as the replacement for invalid
/// characters. It may be empty, in which case the invalid characters
/// are simply removed.
/// \return the new, sanitized string
/// \throw BadValue if given an invalid regular expression or, when built
/// with C++11 regex support, if the replacement itself fails. Unexpected
/// if the regcomp/regexec fallback hits an error executing the expression.
std::string
sanitizeString(const std::string& original,
const std::string& invalidChars,
const std::string& replacement);
} // namespace str
} // namespace util
......
// Copyright (C) 2011-2017 Internet Systems Consortium, Inc. ("ISC")
// Copyright (C) 2011-2018 Internet Systems Consortium, Inc. ("ISC")
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
......@@ -463,4 +463,37 @@ TEST(StringUtilTest, decodeFormattedHexString) {
isc::BadValue);
}
// Exercises sanitizeString(): first the rejection of a malformed expression,
// then a table of substitution scenarios checked one after another.
TEST(StringUtilTest, sanitizeString) {
    std::string output;

    // A malformed regular expression must be rejected.
    ASSERT_THROW (output = sanitizeString("just a string", "[bogus-regex",""),
                  BadValue);

    // One row per scenario: input, expression, replacement, expected output.
    struct Scenario {
        const char* input;
        const char* expression;
        const char* replacement;
        const char* expected;
    };

    const Scenario scenarios[] = {
        // Input made up solely of valid characters comes back unchanged.
        { "-_A--B__Cabc34567_-", "[^A-Ca-c3-7_-]", "x",
          "-_A--B__Cabc34567_-" },
        // Each invalid character is replaced by a single character.
        { "A[b]c\12JoE3-_x!B$Y#e", "[^A-Za-z0-9_]", "*",
          "A*b*c*JoE3*_x*B*Y*e" },
        // An empty replacement removes the invalid characters.
        { "A[b]c\12JoE3-_x!B$Y#e", "[^A-Za-z0-9_]", "",
          "AbcJoE3_xBYe" },
        // Runs of consecutive invalid characters are replaced one by one.
        { "%%A%%B%%C%%", "[^A-Za-z0-9_]", "x",
          "xxAxxBxxCxx" },
        // Removing more than one invalid character in a row works too.
        { "%%A%%B%%C%%", "[^A-Za-z0-9_]", "",
          "ABC" },
        // The replacement may be longer than one character.
        { "%%A%%B%%C%%", "[^A-Za-z0-9_]", "xyz",
          "xyzxyzAxyzxyzBxyzxyzCxyzxyz" }
    };

    const size_t count = sizeof(scenarios) / sizeof(scenarios[0]);
    for (size_t i = 0; i < count; ++i) {
        const Scenario& scenario = scenarios[i];
        ASSERT_NO_THROW (output = sanitizeString(scenario.input,
                                                 scenario.expression,
                                                 scenario.replacement));
        EXPECT_EQ(output, std::string(scenario.expected));
    }
}
} // end of anonymous namespace
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment