/* * This file is part of the Sofia-SIP package * * Copyright (C) 2005 Nokia Corporation. * * Contact: Pekka Pessi * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA * */ /** * @file su_bm.c * @brief Search with Boyer-Moore algorithm * * @author Pekka Pessi * * @date Created: Mon Apr 11 16:35:16 2005 ppessi * */ #include "config.h" #include #include #include #include #include #include #include #ifndef TORTURELOG #define TORTURELOG(x) (void)0 #endif struct bw_fwd_table { unsigned char table[UCHAR_MAX + 1]; }; /** Build forward skip table #bm_fwd_table_t for Boyer-Moore algorithm. */ static bm_fwd_table_t * bm_memmem_study0(char const *needle, size_t nlen, bm_fwd_table_t *fwd) { size_t i; if (nlen >= UCHAR_MAX) { needle += nlen - UCHAR_MAX; nlen = UCHAR_MAX; } memset(&fwd->table, (unsigned char)nlen, sizeof fwd->table); for (i = 0; i < nlen; i++) { fwd->table[(unsigned short)needle[i]] = (unsigned char)(nlen - i - 1); } return fwd; } /** @defgroup su_bm Fast string searching with Boyer-Moore algorithm * * The Boyer-Moore algorithm is used to implement fast substring search. The * algorithm has some overhead caused by filling a table. Substring search * then requires at most 1 / substring-length less string comparisons. On * modern desktop hardware, Boyer-Moore algorithm is seldom faster than the * naive implementation if the searched substring is shorter than the cache * line. * */ /**@ingroup su_bm * @typedef struct bw_fwd_table bm_fwd_table_t; * * Forward skip table for Boyer-Moore algorithm. * */ /** Build case-sensitive forward skip table #bm_fwd_table_t * for Boyer-Moore algorithm. * @ingroup su_bm */ bm_fwd_table_t * bm_memmem_study(char const *needle, size_t nlen) { bm_fwd_table_t *fwd = malloc(sizeof *fwd); if (fwd) bm_memmem_study0(needle, nlen, fwd); return fwd; } /** Search for a substring using Boyer-Moore algorithm. * @ingroup su_bm */ char * bm_memmem(char const *haystack, size_t hlen, char const *needle, size_t nlen, bm_fwd_table_t *fwd) { size_t i, j; bm_fwd_table_t fwd0[1]; if (nlen == 0) return (char *)haystack; if (needle == NULL || haystack == NULL || nlen > hlen) return NULL; if (nlen == 1) { for (i = 0; i < hlen; i++) if (haystack[i] == needle[0]) return (char *)haystack + i; return NULL; } if (!fwd) fwd = bm_memmem_study0(needle, nlen, fwd0); for (i = j = nlen - 1; i < hlen;) { unsigned char h = haystack[i]; if (h == needle[j]) { TORTURELOG(("match \"%s\" at %u\nwith %*s\"%.*s*%s\": %s\n", haystack, (unsigned)i, (int)(i - j), "", (int)j, needle, needle + j + 1, j == 0 ? "match!" : "back by 1")); if (j == 0) return (char *)haystack + i; i--, j--; } else { if (fwd->table[h] > nlen - j) { TORTURELOG(("match \"%s\" at %u\n" "last %*s\"%.*s*%s\": (by %u)\n", haystack, (unsigned)i, (int)(i - j), "", (int)j, needle, needle + j + 1, fwd->table[h])); i += fwd->table[h]; } else { TORTURELOG(("match \"%s\" at %u\n" "2nd %*s\"%.*s*%s\": (by %u)\n", haystack, (unsigned)i, (int)(i - j), "", (int)j, needle, needle + j + 1, (unsigned)(nlen - j))); i += nlen - j; } j = nlen - 1; } } return NULL; } /** Build forward skip table for Boyer-Moore algorithm */ static bm_fwd_table_t * bm_memcasemem_study0(char const *needle, size_t nlen, bm_fwd_table_t *fwd) { size_t i; if (nlen >= UCHAR_MAX) { needle += nlen - UCHAR_MAX; nlen = UCHAR_MAX; } for (i = 0; i < UCHAR_MAX; i++) fwd->table[i] = (unsigned char)nlen; for (i = 0; i < nlen; i++) { unsigned char n = tolower(needle[i]); fwd->table[n] = (unsigned char)(nlen - i - 1); } return fwd; } /** Build case-insensitive forward skip table for Boyer-Moore algorithm. * @ingroup su_bm */ bm_fwd_table_t * bm_memcasemem_study(char const *needle, size_t nlen) { bm_fwd_table_t *fwd = malloc(sizeof *fwd); if (fwd) bm_memcasemem_study0(needle, nlen, fwd); return fwd; } /** Search for substring using Boyer-Moore algorithm. * @ingroup su_bm */ char * bm_memcasemem(char const *haystack, size_t hlen, char const *needle, size_t nlen, bm_fwd_table_t *fwd) { size_t i, j; bm_fwd_table_t fwd0[1]; if (nlen == 0) return (char *)haystack; if (needle == 0 || haystack == 0 || nlen > hlen) return NULL; if (nlen == 1) { for (i = 0; i < hlen; i++) if (haystack[i] == needle[0]) return (char *)haystack + i; return NULL; } if (!fwd) { fwd = bm_memcasemem_study0(needle, nlen, fwd0); } for (i = j = nlen - 1; i < hlen;) { unsigned char h = haystack[i], n = needle[j]; if (isupper(h)) h = tolower(h); if (isupper(n)) n = tolower(n); if (h == n) { TORTURELOG(("match \"%s\" at %u\n" "with %*s\"%.*s*%s\": %s\n", haystack, (unsigned)i, (int)(i - j), "", (int)j, needle, needle + j + 1, j == 0 ? "match!" : "back by 1")); if (j == 0) return (char *)haystack + i; i--, j--; } else { if (fwd->table[h] > nlen - j) { TORTURELOG(("match \"%s\" at %u\n" "last %*s\"%.*s*%s\": (by %u)\n", haystack, (unsigned)i, (int)(i - j), "", (int)j, needle, needle + j + 1, fwd->table[h])); i += fwd->table[h]; } else { TORTURELOG(("match \"%s\" at %u\n" "2nd %*s\"%.*s*%s\": (by %u)\n", haystack, (unsigned)i, (int)(i - j), "", (int)j, needle, needle + j + 1, (unsigned)(nlen - j))); i += nlen - j; } j = nlen - 1; } } return NULL; }