summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/ChangeLog14
-rw-r--r--src/regex.c227
2 files changed, 118 insertions, 123 deletions
diff --git a/src/ChangeLog b/src/ChangeLog
index 3a01bac0160..d3ea1be941d 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,17 @@
+2000-03-27 Stefan Monnier <monnier@cs.yale.edu>
+
+ * regex.c (REGEX_FREE_STACK, RESET_FAIL_STACK): Make them usable as
+ an expression.
+ (enum re_opcode_t): Update description of succeed_n.
+ (PATFETCH): Always define.
+ (regex_compile): Use lookahead rather than PATUNFETCH (for repetition
+ operators, char classes, shy-groups and intervals).
+ Optimize special cases of intervals so as to only use succeed_n and
+ jump_n when really needed.
+ (re_compile_fastmap): Simplify handling of jump_n and succeed_n now
+ that we don't have to handle the special cases any more.
+ Simplify on_failure_jump handling as well.
+
2000-03-28 Jason Rumney <jasonr@gnu.org>
* lread.c (Fload): Move safe_p definition to above #ifdef DOS_NT.
diff --git a/src/regex.c b/src/regex.c
index 671f6c152f3..f7ca3a7b8b4 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -21,7 +21,6 @@
/* TODO:
- use analyze_first to optimize non-empty loops
- - optimize succeed_n and jump_n away when possible
- clean up multibyte issues
- structure the opcode space into opcode+flag.
- merge with glic's regex.[ch]
@@ -411,7 +410,7 @@ char *alloca ();
#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
REGEX_REALLOCATE (source, osize, nsize)
/* No need to explicitly free anything. */
-#define REGEX_FREE_STACK(arg)
+#define REGEX_FREE_STACK(arg) ((void)0)
#endif /* not REGEX_MALLOC */
#endif /* not using relocating allocator */
@@ -546,7 +545,9 @@ typedef enum
on_failure_jump_smart,
/* Followed by two-byte relative address and two-byte number n.
- After matching N times, jump to the address upon failure. */
+ After matching N times, jump to the address upon failure.
+ Does not work if N starts at 0: use on_failure_jump_loop
+ instead. */
succeed_n,
/* Followed by two-byte relative address, and two-byte number n.
@@ -1289,7 +1290,7 @@ typedef struct
fail_stack.frame = 0; \
} while (0)
-#define RESET_FAIL_STACK()
+#define RESET_FAIL_STACK() ((void)0)
#endif
@@ -1546,13 +1547,11 @@ static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
if necessary. Also cast from a signed character in the constant
string passed to us by the user to an unsigned char that we can use
as an array index (in, e.g., `translate'). */
-#ifndef PATFETCH
#define PATFETCH(c) \
do { \
PATFETCH_RAW (c); \
if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
} while (0)
-#endif
/* Fetch the next character in the uncompiled pattern, with no
translation. */
@@ -2103,35 +2102,24 @@ regex_compile (pattern, size, syntax, bufp)
if (p == pend)
break;
-
- PATFETCH (c);
-
- if (c == '*'
- || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
+ else if (*p == '*'
+ || (!(syntax & RE_BK_PLUS_QM)
+ && (*p == '+' || *p == '?')))
;
-
- else if (syntax & RE_BK_PLUS_QM && c == '\\')
+ else if (syntax & RE_BK_PLUS_QM && *p == '\\')
{
- if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
-
- PATFETCH (c1);
- if (!(c1 == '+' || c1 == '?'))
- {
- PATUNFETCH;
- PATUNFETCH;
- break;
- }
-
- c = c1;
+ if (p+1 == pend)
+ FREE_STACK_RETURN (REG_EESCAPE);
+ if (p[1] == '+' || p[1] == '?')
+ PATFETCH (c); /* Gobble up the backslash. */
+ else
+ break;
}
else
- {
- PATUNFETCH;
- break;
- }
-
+ break;
/* If we get here, we found another repeat character. */
- }
+ PATFETCH (c);
+ }
/* Star, etc. applied to an empty pattern is equivalent
to an empty pattern. */
@@ -2306,9 +2294,11 @@ regex_compile (pattern, size, syntax, bufp)
{
/* Leave room for the null. */
char str[CHAR_CLASS_MAX_LENGTH + 1];
+ const unsigned char *class_beg;
PATFETCH (c);
c1 = 0;
+ class_beg = p;
/* If pattern is `[[:'. */
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
@@ -2421,9 +2411,8 @@ regex_compile (pattern, size, syntax, bufp)
}
else
{
- c1++;
- while (c1--)
- PATUNFETCH;
+ /* Go back to right after the "[:". */
+ p = class_beg;
SET_LIST_BIT ('[');
/* Because the `:' may starts the range, we
@@ -2584,9 +2573,9 @@ regex_compile (pattern, size, syntax, bufp)
if (p+1 < pend)
{
/* Look for a special (?...) construct */
- PATFETCH (c);
- if ((syntax & RE_SHY_GROUPS) && c == '?')
+ if ((syntax & RE_SHY_GROUPS) && *p == '?')
{
+ PATFETCH (c); /* Gobble up the '?'. */
PATFETCH (c);
switch (c)
{
@@ -2596,7 +2585,6 @@ regex_compile (pattern, size, syntax, bufp)
FREE_STACK_RETURN (REG_BADPAT);
}
}
- else PATUNFETCH;
}
if (!shy)
@@ -2744,8 +2732,9 @@ regex_compile (pattern, size, syntax, bufp)
if (!(syntax & RE_INTERVALS)
/* If we're at `\{' and it's not the open-interval
operator. */
- || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
- || (p - 2 == pattern && p == pend))
+ || (syntax & RE_NO_BK_BRACES)
+ /* What is that? -sm */
+ /* || (p - 2 == pattern && p == pend) */)
goto normal_backslash;
handle_interval:
@@ -2755,7 +2744,7 @@ regex_compile (pattern, size, syntax, bufp)
/* At least (most) this many matches must be made. */
int lower_bound = 0, upper_bound = -1;
- beg_interval = p - 1;
+ beg_interval = p;
if (p == pend)
{
@@ -2768,16 +2757,13 @@ regex_compile (pattern, size, syntax, bufp)
GET_UNSIGNED_NUMBER (lower_bound);
if (c == ',')
- {
- GET_UNSIGNED_NUMBER (upper_bound);
- if (upper_bound < 0) upper_bound = RE_DUP_MAX;
- }
+ GET_UNSIGNED_NUMBER (upper_bound);
else
/* Interval such as `{1}' => match exactly once. */
upper_bound = lower_bound;
if (lower_bound < 0 || upper_bound > RE_DUP_MAX
- || lower_bound > upper_bound)
+ || (upper_bound >= 0 && lower_bound > upper_bound))
{
if (syntax & RE_NO_BK_BRACES)
goto unfetch_interval;
@@ -2813,15 +2799,13 @@ regex_compile (pattern, size, syntax, bufp)
goto unfetch_interval;
}
- /* If the upper bound is zero, don't want to succeed at
- all; jump from `laststart' to `b + 3', which will be
- the end of the buffer after we insert the jump. */
if (upper_bound == 0)
- {
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (jump, laststart, b + 3);
- b += 3;
- }
+ /* If the upper bound is zero, just drop the sub pattern
+ altogether. */
+ b = laststart;
+ else if (lower_bound == 1 && upper_bound == 1)
+ /* Just match it once: nothing to do here. */
+ ;
/* Otherwise, we have a nontrivial interval. When
we're all done, the pattern will look like:
@@ -2835,28 +2819,49 @@ regex_compile (pattern, size, syntax, bufp)
else
{ /* If the upper bound is > 1, we need to insert
more at the end of the loop. */
- unsigned nbytes = 10 + (upper_bound > 1) * 10;
-
- GET_BUFFER_SPACE (nbytes);
-
- /* Initialize lower bound of the `succeed_n', even
- though it will be set during matching by its
- attendant `set_number_at' (inserted next),
- because `re_compile_fastmap' needs to know.
- Jump to the `jump_n' we might insert below. */
- INSERT_JUMP2 (succeed_n, laststart,
- b + 5 + (upper_bound > 1) * 5,
- lower_bound);
- b += 5;
-
- /* Code to initialize the lower bound. Insert
- before the `succeed_n'. The `5' is the last two
- bytes of this `set_number_at', plus 3 bytes of
- the following `succeed_n'. */
- insert_op2 (set_number_at, laststart, 5, lower_bound, b);
- b += 5;
-
- if (upper_bound > 1)
+ unsigned int nbytes = (upper_bound < 0 ? 3
+ : upper_bound > 1 ? 5 : 0);
+ unsigned int startoffset = 0;
+
+ GET_BUFFER_SPACE (20); /* We might use less. */
+
+ if (lower_bound == 0)
+ {
+ /* A succeed_n that starts with 0 is really a
+ a simple on_failure_jump_loop. */
+ INSERT_JUMP (on_failure_jump_loop, laststart,
+ b + 3 + nbytes);
+ b += 3;
+ }
+ else
+ {
+ /* Initialize lower bound of the `succeed_n', even
+ though it will be set during matching by its
+ attendant `set_number_at' (inserted next),
+ because `re_compile_fastmap' needs to know.
+ Jump to the `jump_n' we might insert below. */
+ INSERT_JUMP2 (succeed_n, laststart,
+ b + 5 + nbytes,
+ lower_bound);
+ b += 5;
+
+ /* Code to initialize the lower bound. Insert
+ before the `succeed_n'. The `5' is the last two
+ bytes of this `set_number_at', plus 3 bytes of
+ the following `succeed_n'. */
+ insert_op2 (set_number_at, laststart, 5, lower_bound, b);
+ b += 5;
+ startoffset += 5;
+ }
+
+ if (upper_bound < 0)
+ {
+ /* A negative upper bound stands for infinity,
+ in which case it degenerates to a plain jump. */
+ STORE_JUMP (jump, b, laststart + startoffset);
+ b += 3;
+ }
+ else if (upper_bound > 1)
{ /* More than one repetition is allowed, so
append a backward jump to the `succeed_n'
that starts this interval.
@@ -2864,7 +2869,7 @@ regex_compile (pattern, size, syntax, bufp)
When we've reached this during matching,
we'll have matched the interval once, so
jump back only `upper_bound - 1' times. */
- STORE_JUMP2 (jump_n, b, laststart + 5,
+ STORE_JUMP2 (jump_n, b, laststart + startoffset,
upper_bound - 1);
b += 5;
@@ -2899,14 +2904,15 @@ regex_compile (pattern, size, syntax, bufp)
beg_interval = NULL;
/* normal_char and normal_backslash need `c'. */
- PATFETCH (c);
+ c = '{';
if (!(syntax & RE_NO_BK_BRACES))
{
- if (p > pattern && p[-1] == '\\')
- goto normal_backslash;
+ assert (p > pattern && p[-1] == '\\');
+ goto normal_backslash;
}
- goto normal_char;
+ else
+ goto normal_char;
#ifdef emacs
/* There is no way to specify the before_dot and after_dot
@@ -3311,9 +3317,6 @@ re_compile_fastmap (bufp)
match the empty string. */
boolean path_can_be_null = true;
- /* We aren't doing a `succeed_n' to begin with. */
- boolean succeed_n_p = false;
-
/* If all elements for base leading-codes in fastmap is set, this
flag is set true. */
boolean match_any_multibyte_characters = false;
@@ -3353,7 +3356,7 @@ re_compile_fastmap (bufp)
as used for the *? operator. */
unsigned char *p1 = p;
- if (p == pend || *p == succeed)
+ if (p >= pend || *p == succeed)
{
/* We have reached the (effective) end of pattern. */
if (!PATTERN_STACK_EMPTY ())
@@ -3510,7 +3513,6 @@ re_compile_fastmap (bufp)
continue;
- case jump_n:
case jump:
EXTRACT_NUMBER_AND_INCR (j, p);
if (j < 0)
@@ -3539,51 +3541,30 @@ re_compile_fastmap (bufp)
case on_failure_jump_nastyloop:
case on_failure_jump_loop:
case on_failure_jump_smart:
- handle_on_failure_jump:
EXTRACT_NUMBER_AND_INCR (j, p);
-
- /* For some patterns, e.g., `(a?)?', `p+j' here points to the
- end of the pattern. We don't want to push such a point,
- since when we restore it above, entering the switch will
- increment `p' past the end of the pattern. We don't need
- to push such a point since we obviously won't find any more
- fastmap entries beyond `pend'. Such a pattern can match
- the null string, though. */
if (p + j <= p1)
- /* Backward jump to be ignored. */
- ;
- else if (p + j < pend)
- {
- if (!PUSH_PATTERN_OP (p + j, fail_stack))
- {
- RESET_FAIL_STACK ();
- return -2;
- }
- }
- else
- bufp->can_be_null = 1;
-
- if (succeed_n_p)
- {
- EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
- succeed_n_p = false;
- }
-
+ ; /* Backward jump to be ignored. */
+ else if (!PUSH_PATTERN_OP (p + j, fail_stack))
+ return (RESET_FAIL_STACK (), -2);
continue;
+ case jump_n:
+ /* This code simply does not properly handle forward jump_n. */
+ DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
+ p += 4;
+ /* jump_n can either jump or fall through. The (backward) jump
+ case has already been handled, so we only need to look at the
+ fallthrough case. */
+ continue;
+
case succeed_n:
- /* Get to the number of times to succeed. */
- p += 2;
-
- /* Increment p past the n for when k != 0. */
- EXTRACT_NUMBER_AND_INCR (k, p);
- if (k == 0)
- {
- p -= 4;
- succeed_n_p = true; /* Spaghetti code alert. */
- goto handle_on_failure_jump;
- }
+ /* If N == 0, it should be an on_failure_jump_loop instead. */
+ DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
+ p += 4;
+ /* We only care about one iteration of the loop, so we don't
+ need to consider the case where this behaves like an
+ on_failure_jump. */
continue;