update to pcre 7.9

git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@13706 d0543943-73ff-0310-b7d9-9358b9ac24b2
This commit is contained in:
Michael Jerris
2009-06-08 23:51:30 +00:00
parent a1e5add731
commit f7efdaa901
178 changed files with 43560 additions and 11382 deletions

View File

@@ -29,6 +29,10 @@
//
// Author: Sanjay Ghemawat
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
@@ -37,13 +41,11 @@
#include <errno.h>
#include <string>
#include <algorithm>
#include "config.h"
// We need this to compile the proper dll on windows/msys. This is copied
// from pcre_internal.h. It would probably be better just to include that.
#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */
#include "pcrecpp_internal.h"
#include "pcre.h"
#include "pcre_stringpiece.h"
#include "pcrecpp.h"
#include "pcre_stringpiece.h"
namespace pcrecpp {
@@ -53,7 +55,23 @@ static const int kMaxArgs = 16;
static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
// Special object that stands-in for no argument
Arg no_arg((void*)NULL);
Arg RE::no_arg((void*)NULL);
// This is for ABI compatibility with old versions of pcre (pre-7.6),
// which defined a global no_arg variable instead of putting it in the
// RE class. This works on GCC >= 3, at least. It definitely works
// for ELF, but may not for other object formats (Mach-O, for
// instance, does not support aliases.) We could probably have a more
// inclusive test if we ever needed it. (Note that not only the
// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
// gnu-specific.)
#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
# define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x)
# define ULP_AS_STRING_INTERNAL(x) #x
# define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__)
extern Arg no_arg
__attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
#endif
// If a regular expression has no error, its error_ field points here
static const string empty_string;
@@ -61,7 +79,7 @@ static const string empty_string;
// If the user doesn't ask for any options, we just use this one
static RE_Options default_options;
void RE::Init(const char* pat, const RE_Options* options) {
void RE::Init(const string& pat, const RE_Options* options) {
pattern_ = pat;
if (options == NULL) {
options_ = default_options;
@@ -74,27 +92,22 @@ void RE::Init(const char* pat, const RE_Options* options) {
re_partial_ = Compile(UNANCHORED);
if (re_partial_ != NULL) {
// Check for complicated patterns. The following change is
// conservative in that it may treat some "simple" patterns
// as "complex" (e.g., if the vertical bar is in a character
// class or is escaped). But it seems good enough.
if (strchr(pat, '|') == NULL) {
// Simple pattern: we can use position-based checks to perform
// fully anchored matches
re_full_ = re_partial_;
} else {
// We need a special pattern for anchored matches
re_full_ = Compile(ANCHOR_BOTH);
}
re_full_ = Compile(ANCHOR_BOTH);
}
}
RE::~RE() {
if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_);
if (re_partial_ != NULL) (*pcre_free)(re_partial_);
if (error_ != &empty_string) delete error_;
void RE::Cleanup() {
if (re_full_ != NULL) (*pcre_free)(re_full_);
if (re_partial_ != NULL) (*pcre_free)(re_partial_);
if (error_ != &empty_string) delete error_;
}
RE::~RE() {
Cleanup();
}
pcre* RE::Compile(Anchor anchor) {
// First, convert RE_Options into pcre options
int pcre_options = 0;
@@ -334,13 +347,17 @@ bool RE::Replace(const StringPiece& rewrite,
// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
// Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
static int NewlineMode(int pcre_options) {
// TODO: if we can make it threadsafe, cache this var
int newline_mode = 0;
/* if (newline_mode) return newline_mode; */ // do this once it's cached
if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)) {
if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
newline_mode = (pcre_options &
(PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF));
(PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
} else {
int newline;
pcre_config(PCRE_CONFIG_NEWLINE, &newline);
@@ -350,8 +367,12 @@ static int NewlineMode(int pcre_options) {
newline_mode = PCRE_NEWLINE_CR;
else if (newline == 3338)
newline_mode = PCRE_NEWLINE_CRLF;
else if (newline == -1)
newline_mode = PCRE_NEWLINE_ANY;
else if (newline == -2)
newline_mode = PCRE_NEWLINE_ANYCRLF;
else
assert("" == "Unexpected return value from pcre_config(NEWLINE)");
assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
}
return newline_mode;
}
@@ -364,7 +385,7 @@ int RE::GlobalReplace(const StringPiece& rewrite,
int start = 0;
int lastend = -1;
for (; start <= static_cast<int>(str->length()); count++) {
while (start <= static_cast<int>(str->length())) {
int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
if (matches <= 0)
break;
@@ -379,9 +400,13 @@ int RE::GlobalReplace(const StringPiece& rewrite,
// Note it's better to call pcre_fullinfo() than to examine
// all_options(), since options_ could have changed bewteen
// compile-time and now, but this is simpler and safe enough.
// Modified by PH to add ANY and ANYCRLF.
if (start+1 < static_cast<int>(str->length()) &&
(*str)[start] == '\r' && (*str)[start+1] == '\n' &&
NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF) {
(NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
) {
matchend++;
}
// We also need to advance more than one char if we're in utf8 mode.
@@ -424,6 +449,40 @@ bool RE::Extract(const StringPiece& rewrite,
return Rewrite(out, rewrite, text, vec, matches);
}
/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
string result;
// Escape any ascii character not in [A-Za-z_0-9].
//
// Note that it's legal to escape a character even if it has no
// special meaning in a regular expression -- so this function does
// that. (This also makes it identical to the perl function of the
// same name; see `perldoc -f quotemeta`.) The one exception is
// escaping NUL: rather than doing backslash + NUL, like perl does,
// we do '\0', because pcre itself doesn't take embedded NUL chars.
for (int ii = 0; ii < unquoted.size(); ++ii) {
// Note that using 'isalnum' here raises the benchmark time from
// 32ns to 58ns:
if (unquoted[ii] == '\0') {
result += "\\0";
} else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
(unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
(unquoted[ii] < '0' || unquoted[ii] > '9') &&
unquoted[ii] != '_' &&
// If this is the part of a UTF8 or Latin1 character, we need
// to copy this byte without escaping. Experimentally this is
// what works correctly with the regexp library.
!(unquoted[ii] & 128)) {
result += '\\';
result += unquoted[ii];
} else {
result += unquoted[ii];
}
}
return result;
}
/***** Actual matching and rewriting code *****/
int RE::TryMatch(const StringPiece& text,
@@ -437,7 +496,7 @@ int RE::TryMatch(const StringPiece& text,
return 0;
}
pcre_extra extra = { 0 };
pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
if (options_.match_limit() > 0) {
extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
extra.match_limit = options_.match_limit();
@@ -470,13 +529,6 @@ int RE::TryMatch(const StringPiece& text,
rc = vecsize / 2;
}
if ((anchor == ANCHOR_BOTH) && (re_full_ == re_partial_)) {
// We need an extra check to make sure that the match extended
// to the end of the input string
assert(vec[0] == 0); // PCRE_ANCHORED forces starting match
if (vec[1] != text.size()) return 0; // Did not get ending match
}
return rc;
}
@@ -553,14 +605,14 @@ bool RE::Rewrite(string *out, const StringPiece &rewrite,
if (start >= 0)
out->append(text.data() + start, vec[2 * n + 1] - start);
} else if (c == '\\') {
out->push_back('\\');
*out += '\\';
} else {
//fprintf(stderr, "invalid rewrite pattern: %.*s\n",
// rewrite.size(), rewrite.data());
return false;
}
} else {
out->push_back(c);
*out += c;
}
}
return true;
@@ -588,23 +640,27 @@ bool Arg::parse_null(const char* str, int n, void* dest) {
}
bool Arg::parse_string(const char* str, int n, void* dest) {
if (dest == NULL) return true;
reinterpret_cast<string*>(dest)->assign(str, n);
return true;
}
bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
if (dest == NULL) return true;
reinterpret_cast<StringPiece*>(dest)->set(str, n);
return true;
}
bool Arg::parse_char(const char* str, int n, void* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
*(reinterpret_cast<char*>(dest)) = str[0];
return true;
}
bool Arg::parse_uchar(const char* str, int n, void* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
*(reinterpret_cast<unsigned char*>(dest)) = str[0];
return true;
}
@@ -653,6 +709,7 @@ bool Arg::parse_long_radix(const char* str,
long r = strtol(str, &end, radix);
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<long*>(dest)) = r;
return true;
}
@@ -670,6 +727,7 @@ bool Arg::parse_ulong_radix(const char* str,
unsigned long r = strtoul(str, &end, radix);
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<unsigned long*>(dest)) = r;
return true;
}
@@ -681,7 +739,8 @@ bool Arg::parse_short_radix(const char* str,
long r;
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range
*(reinterpret_cast<short*>(dest)) = r;
if (dest == NULL) return true;
*(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
return true;
}
@@ -692,7 +751,8 @@ bool Arg::parse_ushort_radix(const char* str,
unsigned long r;
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
if (r > USHRT_MAX) return false; // Out of range
*(reinterpret_cast<unsigned short*>(dest)) = r;
if (dest == NULL) return true;
*(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
return true;
}
@@ -703,6 +763,7 @@ bool Arg::parse_int_radix(const char* str,
long r;
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
if (r < INT_MIN || r > INT_MAX) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<int*>(dest)) = r;
return true;
}
@@ -714,6 +775,7 @@ bool Arg::parse_uint_radix(const char* str,
unsigned long r;
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
if (r > UINT_MAX) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<unsigned int*>(dest)) = r;
return true;
}
@@ -734,11 +796,14 @@ bool Arg::parse_longlong_radix(const char* str,
long long r = strtoq(str, &end, radix);
#elif defined HAVE_STRTOLL
long long r = strtoll(str, &end, radix);
#elif defined HAVE__STRTOI64
long long r = _strtoi64(str, &end, radix);
#else
#error parse_longlong_radix: cannot convert input to a long-long
#endif
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<long long*>(dest)) = r;
return true;
#endif /* HAVE_LONG_LONG */
@@ -761,11 +826,14 @@ bool Arg::parse_ulonglong_radix(const char* str,
unsigned long long r = strtouq(str, &end, radix);
#elif defined HAVE_STRTOLL
unsigned long long r = strtoull(str, &end, radix);
#elif defined HAVE__STRTOI64
unsigned long long r = _strtoui64(str, &end, radix);
#else
#error parse_ulonglong_radix: cannot convert input to a long-long
#endif
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<unsigned long long*>(dest)) = r;
return true;
#endif /* HAVE_UNSIGNED_LONG_LONG */
@@ -783,6 +851,7 @@ bool Arg::parse_double(const char* str, int n, void* dest) {
double r = strtod(buf, &end);
if (end != buf + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<double*>(dest)) = r;
return true;
}
@@ -790,6 +859,7 @@ bool Arg::parse_double(const char* str, int n, void* dest) {
bool Arg::parse_float(const char* str, int n, void* dest) {
double r;
if (!parse_double(str, n, &r)) return false;
if (dest == NULL) return true;
*(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
return true;
}
@@ -809,14 +879,14 @@ bool Arg::parse_float(const char* str, int n, void* dest) {
return parse_##name##_radix(str, n, dest, 0); \
}
DEFINE_INTEGER_PARSERS(short);
DEFINE_INTEGER_PARSERS(ushort);
DEFINE_INTEGER_PARSERS(int);
DEFINE_INTEGER_PARSERS(uint);
DEFINE_INTEGER_PARSERS(long);
DEFINE_INTEGER_PARSERS(ulong);
DEFINE_INTEGER_PARSERS(longlong);
DEFINE_INTEGER_PARSERS(ulonglong);
DEFINE_INTEGER_PARSERS(short) /* */
DEFINE_INTEGER_PARSERS(ushort) /* */
DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
DEFINE_INTEGER_PARSERS(longlong) /* */
DEFINE_INTEGER_PARSERS(ulonglong) /* */
#undef DEFINE_INTEGER_PARSERS