diff options
Diffstat (limited to 'jim-regexp.c')
-rw-r--r-- | jim-regexp.c | 573 |
1 files changed, 573 insertions, 0 deletions
diff --git a/jim-regexp.c b/jim-regexp.c new file mode 100644 index 0000000..8eb457d --- /dev/null +++ b/jim-regexp.c @@ -0,0 +1,573 @@ +/* + * Implements the regexp and regsub commands for Jim + * + * (c) 2008 Steve Bennett <steveb@workware.net.au> + * + * Uses C library regcomp()/regexec() for the matching. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation + * are those of the authors and should not be interpreted as representing + * official policies, either expressed or implied, of the Jim Tcl Project. + * + * Based on code originally from Tcl 6.7: + * + * Copyright 1987-1991 Regents of the University of California + * Permission to use, copy, modify, and distribute this + * software and its documentation for any purpose and without + * fee is hereby granted, provided that the above copyright + * notice appear in all copies. The University of California + * makes no representations about the suitability of this + * software for any purpose. It is provided "as is" without + * express or implied warranty. + */ + +#include <stdlib.h> +#include <string.h> + +#include "jimautoconf.h" +#if defined(JIM_REGEXP) + #include "jimregexp.h" +#else + #include <regex.h> +#endif +#include "jim.h" + +static void FreeRegexpInternalRep(Jim_Interp *interp, Jim_Obj *objPtr) +{ + regfree(objPtr->internalRep.regexpValue.compre); + Jim_Free(objPtr->internalRep.regexpValue.compre); +} + +static const Jim_ObjType regexpObjType = { + "regexp", + FreeRegexpInternalRep, + NULL, + NULL, + JIM_TYPE_NONE +}; + +static regex_t *SetRegexpFromAny(Jim_Interp *interp, Jim_Obj *objPtr, unsigned flags) +{ + regex_t *compre; + const char *pattern; + int ret; + + /* Check if the object is already an uptodate variable */ + if (objPtr->typePtr == ®expObjType && + objPtr->internalRep.regexpValue.compre && objPtr->internalRep.regexpValue.flags == flags) { + /* nothing to do */ + return objPtr->internalRep.regexpValue.compre; + } + + /* Not a regexp or the flags do not match */ + + /* Get the string representation */ + pattern = Jim_String(objPtr); + compre = Jim_Alloc(sizeof(regex_t)); + + if ((ret = regcomp(compre, pattern, REG_EXTENDED | flags)) != 0) { + char buf[100]; + + regerror(ret, compre, buf, sizeof(buf)); + Jim_SetResultFormatted(interp, "couldn't compile regular expression pattern: %s", buf); + regfree(compre); + Jim_Free(compre); + return NULL; + } + + Jim_FreeIntRep(interp, objPtr); + + objPtr->typePtr = ®expObjType; + objPtr->internalRep.regexpValue.flags = flags; + objPtr->internalRep.regexpValue.compre = compre; + + return compre; +} + +int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) +{ + int opt_indices = 0; + int opt_all = 0; + int opt_inline = 0; + regex_t *regex; + int match, i, j; + int offset = 0; + regmatch_t *pmatch = NULL; + int source_len; + int result = JIM_OK; + const char *pattern; + const char *source_str; + int num_matches = 0; + int num_vars; + Jim_Obj *resultListObj = NULL; + int regcomp_flags = 0; + int eflags = 0; + int option; + enum { + OPT_INDICES, OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_INLINE, OPT_START, OPT_END + }; + static const char * const options[] = { + "-indices", "-nocase", "-line", "-all", "-inline", "-start", "--", NULL + }; + + if (argc < 3) { + wrongNumArgs: + Jim_WrongNumArgs(interp, 1, argv, + "?-switch ...? exp string ?matchVar? ?subMatchVar ...?"); + return JIM_ERR; + } + + for (i = 1; i < argc; i++) { + const char *opt = Jim_String(argv[i]); + + if (*opt != '-') { + break; + } + if (Jim_GetEnum(interp, argv[i], options, &option, "switch", JIM_ERRMSG | JIM_ENUM_ABBREV) != JIM_OK) { + return JIM_ERR; + } + if (option == OPT_END) { + i++; + break; + } + switch (option) { + case OPT_INDICES: + opt_indices = 1; + break; + + case OPT_NOCASE: + regcomp_flags |= REG_ICASE; + break; + + case OPT_LINE: + regcomp_flags |= REG_NEWLINE; + break; + + case OPT_ALL: + opt_all = 1; + break; + + case OPT_INLINE: + opt_inline = 1; + break; + + case OPT_START: + if (++i == argc) { + goto wrongNumArgs; + } + if (Jim_GetIndex(interp, argv[i], &offset) != JIM_OK) { + return JIM_ERR; + } + break; + } + } + if (argc - i < 2) { + goto wrongNumArgs; + } + + regex = SetRegexpFromAny(interp, argv[i], regcomp_flags); + if (!regex) { + return JIM_ERR; + } + + pattern = Jim_String(argv[i]); + source_str = Jim_GetString(argv[i + 1], &source_len); + + num_vars = argc - i - 2; + + if (opt_inline) { + if (num_vars) { + Jim_SetResultString(interp, "regexp match variables not allowed when using -inline", + -1); + result = JIM_ERR; + goto done; + } + num_vars = regex->re_nsub + 1; + } + + pmatch = Jim_Alloc((num_vars + 1) * sizeof(*pmatch)); + + /* If an offset has been specified, adjust for that now. + * If it points past the end of the string, point to the terminating null + */ + if (offset) { + if (offset < 0) { + offset += source_len + 1; + } + if (offset > source_len) { + source_str += source_len; + } + else if (offset > 0) { + source_str += offset; + } + eflags |= REG_NOTBOL; + } + + if (opt_inline) { + resultListObj = Jim_NewListObj(interp, NULL, 0); + } + + next_match: + match = regexec(regex, source_str, num_vars + 1, pmatch, eflags); + if (match >= REG_BADPAT) { + char buf[100]; + + regerror(match, regex, buf, sizeof(buf)); + Jim_SetResultFormatted(interp, "error while matching pattern: %s", buf); + result = JIM_ERR; + goto done; + } + + if (match == REG_NOMATCH) { + goto done; + } + + num_matches++; + + if (opt_all && !opt_inline) { + /* Just count the number of matches, so skip the substitution h */ + goto try_next_match; + } + + /* + * If additional variable names have been specified, return + * index information in those variables. + */ + + j = 0; + for (i += 2; opt_inline ? j < num_vars : i < argc; i++, j++) { + Jim_Obj *resultObj; + + if (opt_indices) { + resultObj = Jim_NewListObj(interp, NULL, 0); + } + else { + resultObj = Jim_NewStringObj(interp, "", 0); + } + + if (pmatch[j].rm_so == -1) { + if (opt_indices) { + Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, -1)); + Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, -1)); + } + } + else { + int len = pmatch[j].rm_eo - pmatch[j].rm_so; + + if (opt_indices) { + Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, + offset + pmatch[j].rm_so)); + Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, + offset + pmatch[j].rm_so + len - 1)); + } + else { + Jim_AppendString(interp, resultObj, source_str + pmatch[j].rm_so, len); + } + } + + if (opt_inline) { + Jim_ListAppendElement(interp, resultListObj, resultObj); + } + else { + /* And now set the result variable */ + result = Jim_SetVariable(interp, argv[i], resultObj); + + if (result != JIM_OK) { + Jim_FreeObj(interp, resultObj); + break; + } + } + } + + try_next_match: + if (opt_all && (pattern[0] != '^' || (regcomp_flags & REG_NEWLINE)) && *source_str) { + if (pmatch[0].rm_eo) { + offset += pmatch[0].rm_eo; + source_str += pmatch[0].rm_eo; + } + else { + source_str++; + offset++; + } + if (*source_str) { + eflags = REG_NOTBOL; + goto next_match; + } + } + + done: + if (result == JIM_OK) { + if (opt_inline) { + Jim_SetResult(interp, resultListObj); + } + else { + Jim_SetResultInt(interp, num_matches); + } + } + + Jim_Free(pmatch); + return result; +} + +#define MAX_SUB_MATCHES 50 + +int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) +{ + int regcomp_flags = 0; + int regexec_flags = 0; + int opt_all = 0; + int offset = 0; + regex_t *regex; + const char *p; + int result; + regmatch_t pmatch[MAX_SUB_MATCHES + 1]; + int num_matches = 0; + + int i, j, n; + Jim_Obj *varname; + Jim_Obj *resultObj; + const char *source_str; + int source_len; + const char *replace_str; + int replace_len; + const char *pattern; + int option; + enum { + OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_START, OPT_END + }; + static const char * const options[] = { + "-nocase", "-line", "-all", "-start", "--", NULL + }; + + if (argc < 4) { + wrongNumArgs: + Jim_WrongNumArgs(interp, 1, argv, + "?-switch ...? exp string subSpec ?varName?"); + return JIM_ERR; + } + + for (i = 1; i < argc; i++) { + const char *opt = Jim_String(argv[i]); + + if (*opt != '-') { + break; + } + if (Jim_GetEnum(interp, argv[i], options, &option, "switch", JIM_ERRMSG | JIM_ENUM_ABBREV) != JIM_OK) { + return JIM_ERR; + } + if (option == OPT_END) { + i++; + break; + } + switch (option) { + case OPT_NOCASE: + regcomp_flags |= REG_ICASE; + break; + + case OPT_LINE: + regcomp_flags |= REG_NEWLINE; + break; + + case OPT_ALL: + opt_all = 1; + break; + + case OPT_START: + if (++i == argc) { + goto wrongNumArgs; + } + if (Jim_GetIndex(interp, argv[i], &offset) != JIM_OK) { + return JIM_ERR; + } + break; + } + } + if (argc - i != 3 && argc - i != 4) { + goto wrongNumArgs; + } + + regex = SetRegexpFromAny(interp, argv[i], regcomp_flags); + if (!regex) { + return JIM_ERR; + } + pattern = Jim_String(argv[i]); + + source_str = Jim_GetString(argv[i + 1], &source_len); + replace_str = Jim_GetString(argv[i + 2], &replace_len); + varname = argv[i + 3]; + + /* Create the result string */ + resultObj = Jim_NewStringObj(interp, "", 0); + + /* If an offset has been specified, adjust for that now. + * If it points past the end of the string, point to the terminating null + */ + if (offset) { + if (offset < 0) { + offset += source_len + 1; + } + if (offset > source_len) { + offset = source_len; + } + else if (offset < 0) { + offset = 0; + } + } + + /* Copy the part before -start */ + Jim_AppendString(interp, resultObj, source_str, offset); + + /* + * The following loop is to handle multiple matches within the + * same source string; each iteration handles one match and its + * corresponding substitution. If "-all" hasn't been specified + * then the loop body only gets executed once. + */ + + n = source_len - offset; + p = source_str + offset; + do { + int match = regexec(regex, p, MAX_SUB_MATCHES, pmatch, regexec_flags); + + if (match >= REG_BADPAT) { + char buf[100]; + + regerror(match, regex, buf, sizeof(buf)); + Jim_SetResultFormatted(interp, "error while matching pattern: %s", buf); + return JIM_ERR; + } + if (match == REG_NOMATCH) { + break; + } + + num_matches++; + + /* + * Copy the portion of the source string before the match to the + * result variable. + */ + Jim_AppendString(interp, resultObj, p, pmatch[0].rm_so); + + /* + * Append the subSpec (replace_str) argument to the variable, making appropriate + * substitutions. This code is a bit hairy because of the backslash + * conventions and because the code saves up ranges of characters in + * subSpec to reduce the number of calls to Jim_SetVar. + */ + + for (j = 0; j < replace_len; j++) { + int idx; + int c = replace_str[j]; + + if (c == '&') { + idx = 0; + } + else if (c == '\\' && j < replace_len) { + c = replace_str[++j]; + if ((c >= '0') && (c <= '9')) { + idx = c - '0'; + } + else if ((c == '\\') || (c == '&')) { + Jim_AppendString(interp, resultObj, replace_str + j, 1); + continue; + } + else { + /* If the replacement is a trailing backslash, just replace with a backslash, otherwise + * with the literal backslash and the following character + */ + Jim_AppendString(interp, resultObj, replace_str + j - 1, (j == replace_len) ? 1 : 2); + continue; + } + } + else { + Jim_AppendString(interp, resultObj, replace_str + j, 1); + continue; + } + if ((idx < MAX_SUB_MATCHES) && pmatch[idx].rm_so != -1 && pmatch[idx].rm_eo != -1) { + Jim_AppendString(interp, resultObj, p + pmatch[idx].rm_so, + pmatch[idx].rm_eo - pmatch[idx].rm_so); + } + } + + p += pmatch[0].rm_eo; + n -= pmatch[0].rm_eo; + + /* If -all is not specified, or there is no source left, we are done */ + if (!opt_all || n == 0) { + break; + } + + /* An anchored pattern without -line must be done */ + if ((regcomp_flags & REG_NEWLINE) == 0 && pattern[0] == '^') { + break; + } + + /* If the pattern is empty, need to step forwards */ + if (pattern[0] == '\0' && n) { + /* Need to copy the char we are moving over */ + Jim_AppendString(interp, resultObj, p, 1); + p++; + n--; + } + + regexec_flags |= REG_NOTBOL; + } while (n); + + /* + * Copy the portion of the string after the last match to the + * result variable. + */ + Jim_AppendString(interp, resultObj, p, -1); + + /* And now set or return the result variable */ + if (argc - i == 4) { + result = Jim_SetVariable(interp, varname, resultObj); + + if (result == JIM_OK) { + Jim_SetResultInt(interp, num_matches); + } + else { + Jim_FreeObj(interp, resultObj); + } + } + else { + Jim_SetResult(interp, resultObj); + result = JIM_OK; + } + + return result; +} + +int Jim_regexpInit(Jim_Interp *interp) +{ + if (Jim_PackageProvide(interp, "regexp", "1.0", JIM_ERRMSG)) + return JIM_ERR; + + Jim_CreateCommand(interp, "regexp", Jim_RegexpCmd, NULL, NULL); + Jim_CreateCommand(interp, "regsub", Jim_RegsubCmd, NULL, NULL); + return JIM_OK; +} |