BIND9 Internals: lib/isc/regex.c Source File

00001 /*
00002  * Copyright (C) 2013, 2014  Internet Systems Consortium, Inc. ("ISC")
00003  *
00004  * Permission to use, copy, modify, and/or distribute this software for any
00005  * purpose with or without fee is hereby granted, provided that the above
00006  * copyright notice and this permission notice appear in all copies.
00007  *
00008  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
00009  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
00010  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
00011  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
00012  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
00013  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
00014  * PERFORMANCE OF THIS SOFTWARE.
00015  */
00016 
00017 #include <config.h>
00018 
00019 #include <isc/file.h>
00020 #include <isc/regex.h>
00021 #include <isc/string.h>
00022 
00023 #if VALREGEX_REPORT_REASON
00024 #define FAIL(x) do { reason = (x); goto error; } while(0)
00025 #else
00026 #define FAIL(x) goto error
00027 #endif
00028 
00029 /*
00030  * Validate the regular expression 'C' locale.
00031  */
00032 int
00033 isc_regex_validate(const char *c) {
00034         enum {
00035                 none, parse_bracket, parse_bound,
00036                 parse_ce, parse_ec, parse_cc
00037         } state = none;
00038         /* Well known character classes. */
00039         const char *cc[] = {
00040                 ":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
00041                 ":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
00042                 ":print:", ":xdigit:"
00043         };
00044         isc_boolean_t seen_comma = ISC_FALSE;
00045         isc_boolean_t seen_high = ISC_FALSE;
00046         isc_boolean_t seen_char = ISC_FALSE;
00047         isc_boolean_t seen_ec = ISC_FALSE;
00048         isc_boolean_t seen_ce = ISC_FALSE;
00049         isc_boolean_t have_atom = ISC_FALSE;
00050         int group = 0;
00051         int range = 0;
00052         int sub = 0;
00053         isc_boolean_t empty_ok = ISC_FALSE;
00054         isc_boolean_t neg = ISC_FALSE;
00055         isc_boolean_t was_multiple = ISC_FALSE;
00056         unsigned int low = 0;
00057         unsigned int high = 0;
00058         const char *ccname = NULL;
00059         int range_start = 0;
00060 #if VALREGEX_REPORT_REASON
00061         const char *reason = "";
00062 #endif
00063 
00064         if (c == NULL || *c == 0)
00065                 FAIL("empty string");
00066 
00067         while (c != NULL && *c != 0) {
00068                 switch (state) {
00069                 case none:
00070                         switch (*c) {
00071                         case '\\':      /* make literal */
00072                                 ++c;
00073                                 switch (*c) {
00074                                 case '1': case '2': case '3':
00075                                 case '4': case '5': case '6':
00076                                 case '7': case '8': case '9':
00077                                         if ((*c - '0') > sub)
00078                                                 FAIL("bad back reference");
00079                                         have_atom = ISC_TRUE;
00080                                         was_multiple = ISC_FALSE;
00081                                         break;
00082                                 case 0:
00083                                         FAIL("escaped end-of-string");
00084                                 default:
00085                                         goto literal;
00086                                 }
00087                                 ++c;
00088                                 break;
00089                         case '[':       /* bracket start */
00090                                 ++c;
00091                                 neg = ISC_FALSE;
00092                                 was_multiple = ISC_FALSE;
00093                                 seen_char = ISC_FALSE;
00094                                 state = parse_bracket;
00095                                 break;
00096                         case '{':       /* bound start */
00097                                 switch (c[1]) {
00098                                 case '0': case '1': case '2': case '3':
00099                                 case '4': case '5': case '6': case '7':
00100                                 case '8': case '9':
00101                                         if (!have_atom)
00102                                                 FAIL("no atom");
00103                                         if (was_multiple)
00104                                                 FAIL("was multiple");
00105                                         seen_comma = ISC_FALSE;
00106                                         seen_high = ISC_FALSE;
00107                                         low = high = 0;
00108                                         state = parse_bound;
00109                                         break;
00110                                 default:
00111                                         goto literal;
00112                                 }
00113                                 ++c;
00114                                 have_atom = ISC_TRUE;
00115                                 was_multiple = ISC_TRUE;
00116                                 break;
00117                         case '}':
00118                                 goto literal;
00119                         case '(':       /* group start */
00120                                 have_atom = ISC_FALSE;
00121                                 was_multiple = ISC_FALSE;
00122                                 empty_ok = ISC_TRUE;
00123                                 ++group;
00124                                 ++sub;
00125                                 ++c;
00126                                 break;
00127                         case ')':       /* group end */
00128                                 if (group && !have_atom && !empty_ok)
00129                                         FAIL("empty alternative");
00130                                 have_atom = ISC_TRUE;
00131                                 was_multiple = ISC_FALSE;
00132                                 if (group != 0)
00133                                         --group;
00134                                 ++c;
00135                                 break;
00136                         case '|':       /* alternative seperator */
00137                                 if (!have_atom)
00138                                         FAIL("no atom");
00139                                 have_atom = ISC_FALSE;
00140                                 empty_ok = ISC_FALSE;
00141                                 was_multiple = ISC_FALSE;
00142                                 ++c;
00143                                 break;
00144                         case '^':
00145                         case '$':
00146                                 have_atom = ISC_TRUE;
00147                                 was_multiple = ISC_TRUE;
00148                                 ++c;
00149                                 break;
00150                         case '+':
00151                         case '*':
00152                         case '?':
00153                                 if (was_multiple)
00154                                         FAIL("was multiple");
00155                                 if (!have_atom)
00156                                         FAIL("no atom");
00157                                 have_atom = ISC_TRUE;
00158                                 was_multiple = ISC_TRUE;
00159                                 ++c;
00160                                 break;
00161                         case '.':
00162                         default:
00163                         literal:
00164                                 have_atom = ISC_TRUE;
00165                                 was_multiple = ISC_FALSE;
00166                                 ++c;
00167                                 break;
00168                         }
00169                         break;
00170                 case parse_bound:
00171                         switch (*c) {
00172                         case '0': case '1': case '2': case '3': case '4':
00173                         case '5': case '6': case '7': case '8': case '9':
00174                                 if (!seen_comma) {
00175                                         low = low * 10 + *c - '0';
00176                                         if (low > 255)
00177                                                 FAIL("lower bound too big");
00178                                 } else {
00179                                         seen_high = ISC_TRUE;
00180                                         high = high * 10 + *c - '0';
00181                                         if (high > 255)
00182                                                 FAIL("upper bound too big");
00183                                 }
00184                                 ++c;
00185                                 break;
00186                         case ',':
00187                                 if (seen_comma)
00188                                         FAIL("multiple commas");
00189                                 seen_comma = ISC_TRUE;
00190                                 ++c;
00191                                 break;
00192                         default:
00193                         case '{':
00194                                 FAIL("non digit/comma");
00195                         case '}':
00196                                 if (seen_high && low > high)
00197                                         FAIL("bad parse bound");
00198                                 seen_comma = ISC_FALSE;
00199                                 state = none;
00200                                 ++c;
00201                                 break;
00202                         }
00203                         break;
00204                 case parse_bracket:
00205                         switch (*c) {
00206                         case '^':
00207                                 if (seen_char || neg) goto inside;
00208                                 neg = ISC_TRUE;
00209                                 ++c;
00210                                 break;
00211                         case '-':
00212                                 if (range == 2) goto inside;
00213                                 if (!seen_char) goto inside;
00214                                 if (range == 1)
00215                                         FAIL("bad range");
00216                                 range = 2;
00217                                 ++c;
00218                                 break;
00219                         case '[':
00220                                 ++c;
00221                                 switch (*c) {
00222                                 case '.':       /* collating element */
00223                                         if (range != 0) --range;
00224                                         ++c;
00225                                         state = parse_ce;
00226                                         seen_ce = ISC_FALSE;
00227                                         break;
00228                                 case '=':       /* equivalence class */
00229                                         if (range == 2)
00230                                             FAIL("equivalence class in range");
00231                                         ++c;
00232                                         state = parse_ec;
00233                                         seen_ec = ISC_FALSE;
00234                                         break;
00235                                 case ':':       /* character class */
00236                                         if (range == 2)
00237                                               FAIL("character class in range");
00238                                         ccname = c;
00239                                         ++c;
00240                                         state = parse_cc;
00241                                         break;
00242                                 }
00243                                 seen_char = ISC_TRUE;
00244                                 break;
00245                         case ']':
00246                                 if (!c[1] && !seen_char)
00247                                         FAIL("unfinished brace");
00248                                 if (!seen_char)
00249                                         goto inside;
00250                                 ++c;
00251                                 range = 0;
00252                                 have_atom = ISC_TRUE;
00253                                 state = none;
00254                                 break;
00255                         default:
00256                         inside:
00257                                 seen_char = ISC_TRUE;
00258                                 if (range == 2 && (*c & 0xff) < range_start)
00259                                         FAIL("out of order range");
00260                                 if (range != 0)
00261                                         --range;
00262                                 range_start = *c & 0xff;
00263                                 ++c;
00264                                 break;
00265                         };
00266                         break;
00267                 case parse_ce:
00268                         switch (*c) {
00269                         case '.':
00270                                 ++c;
00271                                 switch (*c) {
00272                                 case ']':
00273                                         if (!seen_ce)
00274                                                  FAIL("empty ce");
00275                                         ++c;
00276                                         state = parse_bracket;
00277                                         break;
00278                                 default:
00279                                         if (seen_ce)
00280                                                 range_start = 256;
00281                                         else
00282                                                 range_start = '.';
00283                                         seen_ce = ISC_TRUE;
00284                                         break;
00285                                 }
00286                                 break;
00287                         default:
00288                                 if (seen_ce)
00289                                         range_start = 256;
00290                                 else
00291                                         range_start = *c;
00292                                 seen_ce = ISC_TRUE;
00293                                 ++c;
00294                                 break;
00295                         }
00296                         break;
00297                 case parse_ec:
00298                         switch (*c) {
00299                         case '=':
00300                                 ++c;
00301                                 switch (*c) {
00302                                 case ']':
00303                                         if (!seen_ec)
00304                                                 FAIL("no ec");
00305                                         ++c;
00306                                         state = parse_bracket;
00307                                         break;
00308                                 default:
00309                                         seen_ec = ISC_TRUE;
00310                                         break;
00311                                 }
00312                                 break;
00313                         default:
00314                                 seen_ec = ISC_TRUE;
00315                                 ++c;
00316                                 break;
00317                         }
00318                         break;
00319                 case parse_cc:
00320                         switch (*c) {
00321                         case ':':
00322                                 ++c;
00323                                 switch (*c) {
00324                                 case ']': {
00325                                         unsigned int i;
00326                                         isc_boolean_t found = ISC_FALSE;
00327                                         for (i = 0;
00328                                              i < sizeof(cc)/sizeof(*cc);
00329                                              i++)
00330                                         {
00331                                                 unsigned int len;
00332                                                 len = strlen(cc[i]);
00333                                                 if (len !=
00334                                                     (unsigned int)(c - ccname))
00335                                                         continue;
00336                                                 if (strncmp(cc[i], ccname, len))
00337                                                         continue;
00338                                                 found = ISC_TRUE;
00339                                         }
00340                                         if (!found)
00341                                                 FAIL("unknown cc");
00342                                         ++c;
00343                                         state = parse_bracket;
00344                                         break;
00345                                         }
00346                                 default:
00347                                         break;
00348                                 }
00349                                 break;
00350                         default:
00351                                 ++c;
00352                                 break;
00353                         }
00354                         break;
00355                 }
00356         }
00357         if (group != 0)
00358                 FAIL("group open");
00359         if (state != none)
00360                 FAIL("incomplete");
00361         if (!have_atom)
00362                 FAIL("no atom");
00363         return (sub);
00364 
00365  error:
00366 #if VALREGEX_REPORT_REASON
00367         fprintf(stderr, "%s\n", reason);
00368 #endif
00369         return (-1);
00370 }