VirtualBox

source: kBuild/trunk/src/sed/lib/regexec.c@ 3613

Last change on this file since 3613 was 3613, checked in by bird, 8 months ago

src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge /vendor/sed/4.1.5 /vendor/sed/current .)

File size: 125.2 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22static void match_ctx_clean (re_match_context_t *mctx);
23static void match_ctx_free (re_match_context_t *cache);
24static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs, regmatch_t *prevregs,
63 re_node_set *eps_via_nodes);
64static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70static int sift_states_iter_mb (const re_match_context_t *mctx,
71 re_sift_context_t *sctx,
72 Idx node_idx, Idx str_idx, Idx max_str_idx);
73static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
74 re_sift_context_t *sctx);
75static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
76 re_sift_context_t *sctx, Idx str_idx,
77 re_node_set *cur_dest);
78static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 Idx str_idx,
81 re_node_set *dest_nodes);
82static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
83 re_node_set *dest_nodes,
84 const re_node_set *candidates);
85static bool check_dst_limits (const re_match_context_t *mctx,
86 const re_node_set *limits,
87 Idx dst_node, Idx dst_idx, Idx src_node,
88 Idx src_idx);
89static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
90 int boundaries, Idx subexp_idx,
91 Idx from_node, Idx bkref_idx);
92static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
93 Idx limit, Idx subexp_idx,
94 Idx node, Idx str_idx,
95 Idx bkref_idx);
96static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates,
99 re_node_set *limits,
100 struct re_backref_cache_entry *bkref_ents,
101 Idx str_idx);
102static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
103 re_sift_context_t *sctx,
104 Idx str_idx, const re_node_set *candidates);
105static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
106 re_dfastate_t **dst,
107 re_dfastate_t **src, Idx num);
108static re_dfastate_t *find_recover_state (reg_errcode_t *err,
109 re_match_context_t *mctx);
110static re_dfastate_t *transit_state (reg_errcode_t *err,
111 re_match_context_t *mctx,
112 re_dfastate_t *state);
113static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
114 re_match_context_t *mctx,
115 re_dfastate_t *next_state);
116static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
117 re_node_set *cur_nodes,
118 Idx str_idx);
119#if 0
120static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
121 re_match_context_t *mctx,
122 re_dfastate_t *pstate);
123#endif
124static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
125 re_dfastate_t *pstate);
126static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
127 const re_node_set *nodes);
128static reg_errcode_t get_subexp (re_match_context_t *mctx,
129 Idx bkref_node, Idx bkref_str_idx);
130static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
131 const re_sub_match_top_t *sub_top,
132 re_sub_match_last_t *sub_last,
133 Idx bkref_node, Idx bkref_str);
134static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
135 Idx subexp_idx, int type);
136static reg_errcode_t check_arrival (re_match_context_t *mctx,
137 state_array_t *path, Idx top_node,
138 Idx top_str, Idx last_node, Idx last_str,
139 int type);
140static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
141 Idx str_idx,
142 re_node_set *cur_nodes,
143 re_node_set *next_nodes);
144static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
145 re_node_set *cur_nodes,
146 Idx ex_subexp, int type);
147static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
148 re_node_set *dst_nodes,
149 Idx target, Idx ex_subexp,
150 int type);
151static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
152 re_node_set *cur_nodes, Idx cur_str,
153 Idx subexp_num, int type);
154static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
155static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
156 const re_string_t *input, Idx idx);
157#ifdef _LIBC
158static unsigned int find_collation_sequence_value (const unsigned char *mbs,
159 size_t name_len);
160#endif
161static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
162 const re_dfastate_t *state,
163 re_node_set *states_node,
164 bitset_t *states_ch);
165static bool check_node_accept (const re_match_context_t *mctx,
166 const re_token_t *node, Idx idx);
167static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
168
169
170/* Entry point for POSIX code. */
171
172/* regexec searches for a given pattern, specified by PREG, in the
173 string STRING.
174
175 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
176 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
177 least NMATCH elements, and we set them to the offsets of the
178 corresponding matched substrings.
179
180 EFLAGS specifies "execution flags" which affect matching: if
181 REG_NOTBOL is set, then ^ does not match at the beginning of the
182 string; if REG_NOTEOL is set, then $ does not match at the end.
183
184 Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
185 EFLAGS is invalid. */
186
187int
188regexec (const regex_t *__restrict preg, const char *__restrict string,
189 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
190{
191 reg_errcode_t err;
192 Idx start, length;
193 re_dfa_t *dfa = preg->buffer;
194
195 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
196 return REG_BADPAT;
197
198 if (eflags & REG_STARTEND)
199 {
200 start = pmatch[0].rm_so;
201 length = pmatch[0].rm_eo;
202 }
203 else
204 {
205 start = 0;
206 length = strlen (string);
207 }
208
209 lock_lock (dfa->lock);
210 if (preg->no_sub)
211 err = re_search_internal (preg, string, length, start, length,
212 length, 0, NULL, eflags);
213 else
214 err = re_search_internal (preg, string, length, start, length,
215 length, nmatch, pmatch, eflags);
216 lock_unlock (dfa->lock);
217 return err != REG_NOERROR;
218}
219
220#ifdef _LIBC
221libc_hidden_def (__regexec)
222
223# include <shlib-compat.h>
224versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
225
226# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
227__typeof__ (__regexec) __compat_regexec;
228
229int
230attribute_compat_text_section
231__compat_regexec (const regex_t *__restrict preg,
232 const char *__restrict string, size_t nmatch,
233 regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
234{
235 return regexec (preg, string, nmatch, pmatch,
236 eflags & (REG_NOTBOL | REG_NOTEOL));
237}
238compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
239# endif
240#endif
241
242/* Entry points for GNU code. */
243
244/* re_match, re_search, re_match_2, re_search_2
245
246 The former two functions operate on STRING with length LENGTH,
247 while the later two operate on concatenation of STRING1 and STRING2
248 with lengths LENGTH1 and LENGTH2, respectively.
249
250 re_match() matches the compiled pattern in BUFP against the string,
251 starting at index START.
252
253 re_search() first tries matching at index START, then it tries to match
254 starting from index START + 1, and so on. The last start position tried
255 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
256 way as re_match().)
257
258 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
259 the first STOP characters of the concatenation of the strings should be
260 concerned.
261
262 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
263 and all groups is stored in REGS. (For the "_2" variants, the offsets are
264 computed relative to the concatenation, not relative to the individual
265 strings.)
266
267 On success, re_match* functions return the length of the match, re_search*
268 return the position of the start of the match. They return -1 on
269 match failure, -2 on error. */
270
271regoff_t
272re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
273 Idx start, struct re_registers *regs)
274{
275 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
276}
277#ifdef _LIBC
278weak_alias (__re_match, re_match)
279#endif
280
281regoff_t
282re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
283 Idx start, regoff_t range, struct re_registers *regs)
284{
285 return re_search_stub (bufp, string, length, start, range, length, regs,
286 false);
287}
288#ifdef _LIBC
289weak_alias (__re_search, re_search)
290#endif
291
292regoff_t
293re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
294 const char *string2, Idx length2, Idx start,
295 struct re_registers *regs, Idx stop)
296{
297 return re_search_2_stub (bufp, string1, length1, string2, length2,
298 start, 0, regs, stop, true);
299}
300#ifdef _LIBC
301weak_alias (__re_match_2, re_match_2)
302#endif
303
304regoff_t
305re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
306 const char *string2, Idx length2, Idx start, regoff_t range,
307 struct re_registers *regs, Idx stop)
308{
309 return re_search_2_stub (bufp, string1, length1, string2, length2,
310 start, range, regs, stop, false);
311}
312#ifdef _LIBC
313weak_alias (__re_search_2, re_search_2)
314#endif
315
316static regoff_t
317re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
318 Idx length1, const char *string2, Idx length2, Idx start,
319 regoff_t range, struct re_registers *regs,
320 Idx stop, bool ret_len)
321{
322 const char *str;
323 regoff_t rval;
324 Idx len;
325 char *s = NULL;
326
327 if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
328 || INT_ADD_WRAPV (length1, length2, &len))))
329 return -2;
330
331 /* Concatenate the strings. */
332 if (length2 > 0)
333 if (length1 > 0)
334 {
335 s = re_malloc (char, len);
336
337 if (__glibc_unlikely (s == NULL))
338 return -2;
339#ifdef _LIBC
340 memcpy (__mempcpy (s, string1, length1), string2, length2);
341#else
342 memcpy (s, string1, length1);
343 memcpy (s + length1, string2, length2);
344#endif
345 str = s;
346 }
347 else
348 str = string2;
349 else
350 str = string1;
351
352 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
353 ret_len);
354 re_free (s);
355 return rval;
356}
357
358/* The parameters have the same meaning as those of re_search.
359 Additional parameters:
360 If RET_LEN is true the length of the match is returned (re_match style);
361 otherwise the position of the match is returned. */
362
363static regoff_t
364re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
365 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
366 bool ret_len)
367{
368 reg_errcode_t result;
369 regmatch_t *pmatch;
370 Idx nregs;
371 regoff_t rval;
372 int eflags = 0;
373 re_dfa_t *dfa = bufp->buffer;
374 Idx last_start = start + range;
375
376 /* Check for out-of-range. */
377 if (__glibc_unlikely (start < 0 || start > length))
378 return -1;
379 if (__glibc_unlikely (length < last_start
380 || (0 <= range && last_start < start)))
381 last_start = length;
382 else if (__glibc_unlikely (last_start < 0
383 || (range < 0 && start <= last_start)))
384 last_start = 0;
385
386 lock_lock (dfa->lock);
387
388 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
389 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
390
391 /* Compile fastmap if we haven't yet. */
392 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
393 re_compile_fastmap (bufp);
394
395 if (__glibc_unlikely (bufp->no_sub))
396 regs = NULL;
397
398 /* We need at least 1 register. */
399 if (regs == NULL)
400 nregs = 1;
401 else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
402 && regs->num_regs <= bufp->re_nsub))
403 {
404 nregs = regs->num_regs;
405 if (__glibc_unlikely (nregs < 1))
406 {
407 /* Nothing can be copied to regs. */
408 regs = NULL;
409 nregs = 1;
410 }
411 }
412 else
413 nregs = bufp->re_nsub + 1;
414 pmatch = re_malloc (regmatch_t, nregs);
415 if (__glibc_unlikely (pmatch == NULL))
416 {
417 rval = -2;
418 goto out;
419 }
420
421 result = re_search_internal (bufp, string, length, start, last_start, stop,
422 nregs, pmatch, eflags);
423
424 rval = 0;
425
426 /* I hope we needn't fill their regs with -1's when no match was found. */
427 if (result != REG_NOERROR)
428 rval = result == REG_NOMATCH ? -1 : -2;
429 else if (regs != NULL)
430 {
431 /* If caller wants register contents data back, copy them. */
432 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
433 bufp->regs_allocated);
434 if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
435 rval = -2;
436 }
437
438 if (__glibc_likely (rval == 0))
439 {
440 if (ret_len)
441 {
442 DEBUG_ASSERT (pmatch[0].rm_so == start);
443 rval = pmatch[0].rm_eo - start;
444 }
445 else
446 rval = pmatch[0].rm_so;
447 }
448 re_free (pmatch);
449 out:
450 lock_unlock (dfa->lock);
451 return rval;
452}
453
454static unsigned
455re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
456 int regs_allocated)
457{
458 int rval = REGS_REALLOCATE;
459 Idx i;
460 Idx need_regs = nregs + 1;
461 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
462 uses. */
463
464 /* Have the register data arrays been allocated? */
465 if (regs_allocated == REGS_UNALLOCATED)
466 { /* No. So allocate them with malloc. */
467 regs->start = re_malloc (regoff_t, need_regs);
468 if (__glibc_unlikely (regs->start == NULL))
469 return REGS_UNALLOCATED;
470 regs->end = re_malloc (regoff_t, need_regs);
471 if (__glibc_unlikely (regs->end == NULL))
472 {
473 re_free (regs->start);
474 return REGS_UNALLOCATED;
475 }
476 regs->num_regs = need_regs;
477 }
478 else if (regs_allocated == REGS_REALLOCATE)
479 { /* Yes. If we need more elements than were already
480 allocated, reallocate them. If we need fewer, just
481 leave it alone. */
482 if (__glibc_unlikely (need_regs > regs->num_regs))
483 {
484 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
485 regoff_t *new_end;
486 if (__glibc_unlikely (new_start == NULL))
487 return REGS_UNALLOCATED;
488 new_end = re_realloc (regs->end, regoff_t, need_regs);
489 if (__glibc_unlikely (new_end == NULL))
490 {
491 re_free (new_start);
492 return REGS_UNALLOCATED;
493 }
494 regs->start = new_start;
495 regs->end = new_end;
496 regs->num_regs = need_regs;
497 }
498 }
499 else
500 {
501 DEBUG_ASSERT (regs_allocated == REGS_FIXED);
502 /* This function may not be called with REGS_FIXED and nregs too big. */
503 DEBUG_ASSERT (nregs <= regs->num_regs);
504 rval = REGS_FIXED;
505 }
506
507 /* Copy the regs. */
508 for (i = 0; i < nregs; ++i)
509 {
510 regs->start[i] = pmatch[i].rm_so;
511 regs->end[i] = pmatch[i].rm_eo;
512 }
513 for ( ; i < regs->num_regs; ++i)
514 regs->start[i] = regs->end[i] = -1;
515
516 return rval;
517}
518
519/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
520 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
521 this memory for recording register information. STARTS and ENDS
522 must be allocated using the malloc library routine, and must each
523 be at least NUM_REGS * sizeof (regoff_t) bytes long.
524
525 If NUM_REGS == 0, then subsequent matches should allocate their own
526 register data.
527
528 Unless this function is called, the first search or match using
529 PATTERN_BUFFER will allocate its own register data, without
530 freeing the old data. */
531
532void
533re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
534 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
535{
536 if (num_regs)
537 {
538 bufp->regs_allocated = REGS_REALLOCATE;
539 regs->num_regs = num_regs;
540 regs->start = starts;
541 regs->end = ends;
542 }
543 else
544 {
545 bufp->regs_allocated = REGS_UNALLOCATED;
546 regs->num_regs = 0;
547 regs->start = regs->end = NULL;
548 }
549}
550#ifdef _LIBC
551weak_alias (__re_set_registers, re_set_registers)
552#endif
553
554
555/* Entry points compatible with 4.2 BSD regex library. We don't define
556 them unless specifically requested. */
557
558#if defined _REGEX_RE_COMP || defined _LIBC
559int
560# ifdef _LIBC
561weak_function
562# endif
563re_exec (const char *s)
564{
565 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
566}
567#endif /* _REGEX_RE_COMP */
568
569
570/* Internal entry point. */
571
572/* Searches for a compiled pattern PREG in the string STRING, whose
573 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
574 meaning as with regexec. LAST_START is START + RANGE, where
575 START and RANGE have the same meaning as with re_search.
576 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
577 otherwise return the error code.
578 Note: We assume front end functions already check ranges.
579 (0 <= LAST_START && LAST_START <= LENGTH) */
580
581static reg_errcode_t
582__attribute_warn_unused_result__
583re_search_internal (const regex_t *preg, const char *string, Idx length,
584 Idx start, Idx last_start, Idx stop, size_t nmatch,
585 regmatch_t pmatch[], int eflags)
586{
587 reg_errcode_t err;
588 const re_dfa_t *dfa = preg->buffer;
589 Idx left_lim, right_lim;
590 int incr;
591 bool fl_longest_match;
592 int match_kind;
593 Idx match_first;
594 Idx match_last = -1;
595 Idx extra_nmatch;
596 bool sb;
597 int ch;
598 re_match_context_t mctx = { .dfa = dfa };
599 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
600 && start != last_start && !preg->can_be_null)
601 ? preg->fastmap : NULL);
602 RE_TRANSLATE_TYPE t = preg->translate;
603
604 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
605 nmatch -= extra_nmatch;
606
607 /* Check if the DFA haven't been compiled. */
608 if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
609 || dfa->init_state_word == NULL
610 || dfa->init_state_nl == NULL
611 || dfa->init_state_begbuf == NULL))
612 return REG_NOMATCH;
613
614 /* We assume front-end functions already check them. */
615 DEBUG_ASSERT (0 <= last_start && last_start <= length);
616
617 /* If initial states with non-begbuf contexts have no elements,
618 the regex must be anchored. If preg->newline_anchor is set,
619 we'll never use init_state_nl, so do not check it. */
620 if (dfa->init_state->nodes.nelem == 0
621 && dfa->init_state_word->nodes.nelem == 0
622 && (dfa->init_state_nl->nodes.nelem == 0
623 || !preg->newline_anchor))
624 {
625 if (start != 0 && last_start != 0)
626 return REG_NOMATCH;
627 start = last_start = 0;
628 }
629
630 /* We must check the longest matching, if nmatch > 0. */
631 fl_longest_match = (nmatch != 0 || dfa->nbackref);
632
633 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
634 preg->translate, (preg->syntax & RE_ICASE) != 0,
635 dfa);
636 if (__glibc_unlikely (err != REG_NOERROR))
637 goto free_return;
638 mctx.input.stop = stop;
639 mctx.input.raw_stop = stop;
640 mctx.input.newline_anchor = preg->newline_anchor;
641
642 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
643 if (__glibc_unlikely (err != REG_NOERROR))
644 goto free_return;
645
646 /* We will log all the DFA states through which the dfa pass,
647 if nmatch > 1, or this dfa has "multibyte node", which is a
648 back-reference or a node which can accept multibyte character or
649 multi character collating element. */
650 if (nmatch > 1 || dfa->has_mb_node)
651 {
652 /* Avoid overflow. */
653 if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
654 <= mctx.input.bufs_len)))
655 {
656 err = REG_ESPACE;
657 goto free_return;
658 }
659
660 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
661 if (__glibc_unlikely (mctx.state_log == NULL))
662 {
663 err = REG_ESPACE;
664 goto free_return;
665 }
666 }
667
668 match_first = start;
669 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
670 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
671
672 /* Check incrementally whether the input string matches. */
673 incr = (last_start < start) ? -1 : 1;
674 left_lim = (last_start < start) ? last_start : start;
675 right_lim = (last_start < start) ? start : last_start;
676 sb = dfa->mb_cur_max == 1;
677 match_kind =
678 (fastmap
679 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
680 | (start <= last_start ? 2 : 0)
681 | (t != NULL ? 1 : 0))
682 : 8);
683
684 for (;; match_first += incr)
685 {
686 err = REG_NOMATCH;
687 if (match_first < left_lim || right_lim < match_first)
688 goto free_return;
689
690 /* Advance as rapidly as possible through the string, until we
691 find a plausible place to start matching. This may be done
692 with varying efficiency, so there are various possibilities:
693 only the most common of them are specialized, in order to
694 save on code size. We use a switch statement for speed. */
695 switch (match_kind)
696 {
697 case 8:
698 /* No fastmap. */
699 break;
700
701 case 7:
702 /* Fastmap with single-byte translation, match forward. */
703 while (__glibc_likely (match_first < right_lim)
704 && !fastmap[t[(unsigned char) string[match_first]]])
705 ++match_first;
706 goto forward_match_found_start_or_reached_end;
707
708 case 6:
709 /* Fastmap without translation, match forward. */
710 while (__glibc_likely (match_first < right_lim)
711 && !fastmap[(unsigned char) string[match_first]])
712 ++match_first;
713
714 forward_match_found_start_or_reached_end:
715 if (__glibc_unlikely (match_first == right_lim))
716 {
717 ch = match_first >= length
718 ? 0 : (unsigned char) string[match_first];
719 if (!fastmap[t ? t[ch] : ch])
720 goto free_return;
721 }
722 break;
723
724 case 4:
725 case 5:
726 /* Fastmap without multi-byte translation, match backwards. */
727 while (match_first >= left_lim)
728 {
729 ch = match_first >= length
730 ? 0 : (unsigned char) string[match_first];
731 if (fastmap[t ? t[ch] : ch])
732 break;
733 --match_first;
734 }
735 if (match_first < left_lim)
736 goto free_return;
737 break;
738
739 default:
740 /* In this case, we can't determine easily the current byte,
741 since it might be a component byte of a multibyte
742 character. Then we use the constructed buffer instead. */
743 for (;;)
744 {
745 /* If MATCH_FIRST is out of the valid range, reconstruct the
746 buffers. */
747 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
748 if (__glibc_unlikely (offset
749 >= (__re_size_t) mctx.input.valid_raw_len))
750 {
751 err = re_string_reconstruct (&mctx.input, match_first,
752 eflags);
753 if (__glibc_unlikely (err != REG_NOERROR))
754 goto free_return;
755
756 offset = match_first - mctx.input.raw_mbs_idx;
757 }
758 /* Use buffer byte if OFFSET is in buffer, otherwise '\0'. */
759 ch = (offset < mctx.input.valid_len
760 ? re_string_byte_at (&mctx.input, offset) : 0);
761 if (fastmap[ch])
762 break;
763 match_first += incr;
764 if (match_first < left_lim || match_first > right_lim)
765 {
766 err = REG_NOMATCH;
767 goto free_return;
768 }
769 }
770 break;
771 }
772
773 /* Reconstruct the buffers so that the matcher can assume that
774 the matching starts from the beginning of the buffer. */
775 err = re_string_reconstruct (&mctx.input, match_first, eflags);
776 if (__glibc_unlikely (err != REG_NOERROR))
777 goto free_return;
778
779 /* Don't consider this char as a possible match start if it part,
780 yet isn't the head, of a multibyte character. */
781 if (!sb && !re_string_first_byte (&mctx.input, 0))
782 continue;
783
784 /* It seems to be appropriate one, then use the matcher. */
785 /* We assume that the matching starts from 0. */
786 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
787 match_last = check_matching (&mctx, fl_longest_match,
788 start <= last_start ? &match_first : NULL);
789 if (match_last != -1)
790 {
791 if (__glibc_unlikely (match_last == -2))
792 {
793 err = REG_ESPACE;
794 goto free_return;
795 }
796 else
797 {
798 mctx.match_last = match_last;
799 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
800 {
801 re_dfastate_t *pstate = mctx.state_log[match_last];
802 mctx.last_node = check_halt_state_context (&mctx, pstate,
803 match_last);
804 }
805 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
806 || dfa->nbackref)
807 {
808 err = prune_impossible_nodes (&mctx);
809 if (err == REG_NOERROR)
810 break;
811 if (__glibc_unlikely (err != REG_NOMATCH))
812 goto free_return;
813 match_last = -1;
814 }
815 else
816 break; /* We found a match. */
817 }
818 }
819
820 match_ctx_clean (&mctx);
821 }
822
823 DEBUG_ASSERT (match_last != -1);
824 DEBUG_ASSERT (err == REG_NOERROR);
825
826 /* Set pmatch[] if we need. */
827 if (nmatch > 0)
828 {
829 Idx reg_idx;
830
831 /* Initialize registers. */
832 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
833 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
834
835 /* Set the points where matching start/end. */
836 pmatch[0].rm_so = 0;
837 pmatch[0].rm_eo = mctx.match_last;
838 /* FIXME: This function should fail if mctx.match_last exceeds
839 the maximum possible regoff_t value. We need a new error
840 code REG_OVERFLOW. */
841
842 if (!preg->no_sub && nmatch > 1)
843 {
844 err = set_regs (preg, &mctx, nmatch, pmatch,
845 dfa->has_plural_match && dfa->nbackref > 0);
846 if (__glibc_unlikely (err != REG_NOERROR))
847 goto free_return;
848 }
849
850 /* At last, add the offset to each register, since we slid
851 the buffers so that we could assume that the matching starts
852 from 0. */
853 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
854 if (pmatch[reg_idx].rm_so != -1)
855 {
856 if (__glibc_unlikely (mctx.input.offsets_needed != 0))
857 {
858 pmatch[reg_idx].rm_so =
859 (pmatch[reg_idx].rm_so == mctx.input.valid_len
860 ? mctx.input.valid_raw_len
861 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
862 pmatch[reg_idx].rm_eo =
863 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
864 ? mctx.input.valid_raw_len
865 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
866 }
867 pmatch[reg_idx].rm_so += match_first;
868 pmatch[reg_idx].rm_eo += match_first;
869 }
870 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
871 {
872 pmatch[nmatch + reg_idx].rm_so = -1;
873 pmatch[nmatch + reg_idx].rm_eo = -1;
874 }
875
876 if (dfa->subexp_map)
877 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
878 if (dfa->subexp_map[reg_idx] != reg_idx)
879 {
880 pmatch[reg_idx + 1].rm_so
881 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
882 pmatch[reg_idx + 1].rm_eo
883 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
884 }
885 }
886
887 free_return:
888 re_free (mctx.state_log);
889 if (dfa->nbackref)
890 match_ctx_free (&mctx);
891 re_string_destruct (&mctx.input);
892 return err;
893}
894
895static reg_errcode_t
896__attribute_warn_unused_result__
897prune_impossible_nodes (re_match_context_t *mctx)
898{
899 const re_dfa_t *const dfa = mctx->dfa;
900 Idx halt_node, match_last;
901 reg_errcode_t ret;
902 re_dfastate_t **sifted_states;
903 re_dfastate_t **lim_states = NULL;
904 re_sift_context_t sctx;
905 DEBUG_ASSERT (mctx->state_log != NULL);
906 match_last = mctx->match_last;
907 halt_node = mctx->last_node;
908
909 /* Avoid overflow. */
910 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
911 <= match_last))
912 return REG_ESPACE;
913
914 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
915 if (__glibc_unlikely (sifted_states == NULL))
916 {
917 ret = REG_ESPACE;
918 goto free_return;
919 }
920 if (dfa->nbackref)
921 {
922 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
923 if (__glibc_unlikely (lim_states == NULL))
924 {
925 ret = REG_ESPACE;
926 goto free_return;
927 }
928 while (1)
929 {
930 memset (lim_states, '\0',
931 sizeof (re_dfastate_t *) * (match_last + 1));
932 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
933 match_last);
934 ret = sift_states_backward (mctx, &sctx);
935 re_node_set_free (&sctx.limits);
936 if (__glibc_unlikely (ret != REG_NOERROR))
937 goto free_return;
938 if (sifted_states[0] != NULL || lim_states[0] != NULL)
939 break;
940 do
941 {
942 --match_last;
943 if (match_last < 0)
944 {
945 ret = REG_NOMATCH;
946 goto free_return;
947 }
948 } while (mctx->state_log[match_last] == NULL
949 || !mctx->state_log[match_last]->halt);
950 halt_node = check_halt_state_context (mctx,
951 mctx->state_log[match_last],
952 match_last);
953 }
954 ret = merge_state_array (dfa, sifted_states, lim_states,
955 match_last + 1);
956 re_free (lim_states);
957 lim_states = NULL;
958 if (__glibc_unlikely (ret != REG_NOERROR))
959 goto free_return;
960 }
961 else
962 {
963 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
964 ret = sift_states_backward (mctx, &sctx);
965 re_node_set_free (&sctx.limits);
966 if (__glibc_unlikely (ret != REG_NOERROR))
967 goto free_return;
968 if (sifted_states[0] == NULL)
969 {
970 ret = REG_NOMATCH;
971 goto free_return;
972 }
973 }
974 re_free (mctx->state_log);
975 mctx->state_log = sifted_states;
976 sifted_states = NULL;
977 mctx->last_node = halt_node;
978 mctx->match_last = match_last;
979 ret = REG_NOERROR;
980 free_return:
981 re_free (sifted_states);
982 re_free (lim_states);
983 return ret;
984}
985
986/* Acquire an initial state and return it.
987 We must select appropriate initial state depending on the context,
988 since initial states may have constraints like "\<", "^", etc.. */
989
990static __always_inline re_dfastate_t *
991acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
992 Idx idx)
993{
994 const re_dfa_t *const dfa = mctx->dfa;
995 if (dfa->init_state->has_constraint)
996 {
997 unsigned int context;
998 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
999 if (IS_WORD_CONTEXT (context))
1000 return dfa->init_state_word;
1001 else if (IS_ORDINARY_CONTEXT (context))
1002 return dfa->init_state;
1003 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1004 return dfa->init_state_begbuf;
1005 else if (IS_NEWLINE_CONTEXT (context))
1006 return dfa->init_state_nl;
1007 else if (IS_BEGBUF_CONTEXT (context))
1008 {
1009 /* It is relatively rare case, then calculate on demand. */
1010 return re_acquire_state_context (err, dfa,
1011 dfa->init_state->entrance_nodes,
1012 context);
1013 }
1014 else
1015 /* Must not happen? */
1016 return dfa->init_state;
1017 }
1018 else
1019 return dfa->init_state;
1020}
1021
1022/* Check whether the regular expression match input string INPUT or not,
1023 and return the index where the matching end. Return -1 if
1024 there is no match, and return -2 in case of an error.
1025 FL_LONGEST_MATCH means we want the POSIX longest matching.
1026 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1027 next place where we may want to try matching.
1028 Note that the matcher assumes that the matching starts from the current
1029 index of the buffer. */
1030
1031static Idx
1032__attribute_warn_unused_result__
1033check_matching (re_match_context_t *mctx, bool fl_longest_match,
1034 Idx *p_match_first)
1035{
1036 const re_dfa_t *const dfa = mctx->dfa;
1037 reg_errcode_t err;
1038 Idx match = 0;
1039 Idx match_last = -1;
1040 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1041 re_dfastate_t *cur_state;
1042 bool at_init_state = p_match_first != NULL;
1043 Idx next_start_idx = cur_str_idx;
1044
1045 err = REG_NOERROR;
1046 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1047 /* An initial state must not be NULL (invalid). */
1048 if (__glibc_unlikely (cur_state == NULL))
1049 {
1050 DEBUG_ASSERT (err == REG_ESPACE);
1051 return -2;
1052 }
1053
1054 if (mctx->state_log != NULL)
1055 {
1056 mctx->state_log[cur_str_idx] = cur_state;
1057
1058 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1059 later. E.g. Processing back references. */
1060 if (__glibc_unlikely (dfa->nbackref))
1061 {
1062 at_init_state = false;
1063 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1064 if (__glibc_unlikely (err != REG_NOERROR))
1065 return err;
1066
1067 if (cur_state->has_backref)
1068 {
1069 err = transit_state_bkref (mctx, &cur_state->nodes);
1070 if (__glibc_unlikely (err != REG_NOERROR))
1071 return err;
1072 }
1073 }
1074 }
1075
1076 /* If the RE accepts NULL string. */
1077 if (__glibc_unlikely (cur_state->halt))
1078 {
1079 if (!cur_state->has_constraint
1080 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1081 {
1082 if (!fl_longest_match)
1083 return cur_str_idx;
1084 else
1085 {
1086 match_last = cur_str_idx;
1087 match = 1;
1088 }
1089 }
1090 }
1091
1092 while (!re_string_eoi (&mctx->input))
1093 {
1094 re_dfastate_t *old_state = cur_state;
1095 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1096
1097 if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1098 && mctx->input.bufs_len < mctx->input.len)
1099 || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1100 && mctx->input.valid_len < mctx->input.len))
1101 {
1102 err = extend_buffers (mctx, next_char_idx + 1);
1103 if (__glibc_unlikely (err != REG_NOERROR))
1104 {
1105 DEBUG_ASSERT (err == REG_ESPACE);
1106 return -2;
1107 }
1108 }
1109
1110 cur_state = transit_state (&err, mctx, cur_state);
1111 if (mctx->state_log != NULL)
1112 cur_state = merge_state_with_log (&err, mctx, cur_state);
1113
1114 if (cur_state == NULL)
1115 {
1116 /* Reached the invalid state or an error. Try to recover a valid
1117 state using the state log, if available and if we have not
1118 already found a valid (even if not the longest) match. */
1119 if (__glibc_unlikely (err != REG_NOERROR))
1120 return -2;
1121
1122 if (mctx->state_log == NULL
1123 || (match && !fl_longest_match)
1124 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1125 break;
1126 }
1127
1128 if (__glibc_unlikely (at_init_state))
1129 {
1130 if (old_state == cur_state)
1131 next_start_idx = next_char_idx;
1132 else
1133 at_init_state = false;
1134 }
1135
1136 if (cur_state->halt)
1137 {
1138 /* Reached a halt state.
1139 Check the halt state can satisfy the current context. */
1140 if (!cur_state->has_constraint
1141 || check_halt_state_context (mctx, cur_state,
1142 re_string_cur_idx (&mctx->input)))
1143 {
1144 /* We found an appropriate halt state. */
1145 match_last = re_string_cur_idx (&mctx->input);
1146 match = 1;
1147
1148 /* We found a match, do not modify match_first below. */
1149 p_match_first = NULL;
1150 if (!fl_longest_match)
1151 break;
1152 }
1153 }
1154 }
1155
1156 if (p_match_first)
1157 *p_match_first += next_start_idx;
1158
1159 return match_last;
1160}
1161
1162/* Check NODE match the current context. */
1163
1164static bool
1165check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1166{
1167 re_token_type_t type = dfa->nodes[node].type;
1168 unsigned int constraint = dfa->nodes[node].constraint;
1169 if (type != END_OF_RE)
1170 return false;
1171 if (!constraint)
1172 return true;
1173 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1174 return false;
1175 return true;
1176}
1177
1178/* Check the halt state STATE match the current context.
1179 Return 0 if not match, if the node, STATE has, is a halt node and
1180 match the context, return the node. */
1181
1182static Idx
1183check_halt_state_context (const re_match_context_t *mctx,
1184 const re_dfastate_t *state, Idx idx)
1185{
1186 Idx i;
1187 unsigned int context;
1188 DEBUG_ASSERT (state->halt);
1189 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1190 for (i = 0; i < state->nodes.nelem; ++i)
1191 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1192 return state->nodes.elems[i];
1193 return 0;
1194}
1195
1196/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1197 corresponding to the DFA).
1198 Return the destination node, and update EPS_VIA_NODES;
1199 return -1 on match failure, -2 on error. */
1200
1201static Idx
1202proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1203 regmatch_t *prevregs,
1204 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1205 struct re_fail_stack_t *fs)
1206{
1207 const re_dfa_t *const dfa = mctx->dfa;
1208 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1209 {
1210 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1211 re_node_set *edests = &dfa->edests[node];
1212
1213 if (! re_node_set_contains (eps_via_nodes, node))
1214 {
1215 bool ok = re_node_set_insert (eps_via_nodes, node);
1216 if (__glibc_unlikely (! ok))
1217 return -2;
1218 }
1219
1220 /* Pick a valid destination, or return -1 if none is found. */
1221 Idx dest_node = -1;
1222 for (Idx i = 0; i < edests->nelem; i++)
1223 {
1224 Idx candidate = edests->elems[i];
1225 if (!re_node_set_contains (cur_nodes, candidate))
1226 continue;
1227 if (dest_node == -1)
1228 dest_node = candidate;
1229
1230 else
1231 {
1232 /* In order to avoid infinite loop like "(a*)*", return the second
1233 epsilon-transition if the first was already considered. */
1234 if (re_node_set_contains (eps_via_nodes, dest_node))
1235 return candidate;
1236
1237 /* Otherwise, push the second epsilon-transition on the fail stack. */
1238 else if (fs != NULL
1239 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1240 prevregs, eps_via_nodes))
1241 return -2;
1242
1243 /* We know we are going to exit. */
1244 break;
1245 }
1246 }
1247 return dest_node;
1248 }
1249 else
1250 {
1251 Idx naccepted = 0;
1252 re_token_type_t type = dfa->nodes[node].type;
1253
1254 if (dfa->nodes[node].accept_mb)
1255 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1256 else if (type == OP_BACK_REF)
1257 {
1258 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1259 if (subexp_idx < nregs)
1260 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1261 if (fs != NULL)
1262 {
1263 if (subexp_idx >= nregs
1264 || regs[subexp_idx].rm_so == -1
1265 || regs[subexp_idx].rm_eo == -1)
1266 return -1;
1267 else if (naccepted)
1268 {
1269 char *buf = (char *) re_string_get_buffer (&mctx->input);
1270 if (mctx->input.valid_len - *pidx < naccepted
1271 || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1272 naccepted)
1273 != 0))
1274 return -1;
1275 }
1276 }
1277
1278 if (naccepted == 0)
1279 {
1280 Idx dest_node;
1281 bool ok = re_node_set_insert (eps_via_nodes, node);
1282 if (__glibc_unlikely (! ok))
1283 return -2;
1284 dest_node = dfa->edests[node].elems[0];
1285 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1286 dest_node))
1287 return dest_node;
1288 }
1289 }
1290
1291 if (naccepted != 0
1292 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1293 {
1294 Idx dest_node = dfa->nexts[node];
1295 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1296 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1297 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1298 dest_node)))
1299 return -1;
1300 re_node_set_empty (eps_via_nodes);
1301 return dest_node;
1302 }
1303 }
1304 return -1;
1305}
1306
1307static reg_errcode_t
1308__attribute_warn_unused_result__
1309push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1310 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1311 re_node_set *eps_via_nodes)
1312{
1313 reg_errcode_t err;
1314 Idx num = fs->num;
1315 if (num == fs->alloc)
1316 {
1317 struct re_fail_stack_ent_t *new_array;
1318 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1319 fs->alloc * 2);
1320 if (new_array == NULL)
1321 return REG_ESPACE;
1322 fs->alloc *= 2;
1323 fs->stack = new_array;
1324 }
1325 fs->stack[num].idx = str_idx;
1326 fs->stack[num].node = dest_node;
1327 fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1328 if (fs->stack[num].regs == NULL)
1329 return REG_ESPACE;
1330 fs->num = num + 1;
1331 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1332 memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1333 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1334 return err;
1335}
1336
1337static Idx
1338pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1339 regmatch_t *regs, regmatch_t *prevregs,
1340 re_node_set *eps_via_nodes)
1341{
1342 if (fs == NULL || fs->num == 0)
1343 return -1;
1344 Idx num = --fs->num;
1345 *pidx = fs->stack[num].idx;
1346 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1347 memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1348 re_node_set_free (eps_via_nodes);
1349 re_free (fs->stack[num].regs);
1350 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1351 DEBUG_ASSERT (0 <= fs->stack[num].node);
1352 return fs->stack[num].node;
1353}
1354
1355
1356#define DYNARRAY_STRUCT regmatch_list
1357#define DYNARRAY_ELEMENT regmatch_t
1358#define DYNARRAY_PREFIX regmatch_list_
1359#include <malloc/dynarray-skeleton.c>
1360
1361/* Set the positions where the subexpressions are starts/ends to registers
1362 PMATCH.
1363 Note: We assume that pmatch[0] is already set, and
1364 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1365
1366static reg_errcode_t
1367__attribute_warn_unused_result__
1368set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1369 regmatch_t *pmatch, bool fl_backtrack)
1370{
1371 const re_dfa_t *dfa = preg->buffer;
1372 Idx idx, cur_node;
1373 re_node_set eps_via_nodes;
1374 struct re_fail_stack_t *fs;
1375 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1376 struct regmatch_list prev_match;
1377 regmatch_list_init (&prev_match);
1378
1379 DEBUG_ASSERT (nmatch > 1);
1380 DEBUG_ASSERT (mctx->state_log != NULL);
1381 if (fl_backtrack)
1382 {
1383 fs = &fs_body;
1384 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1385 if (fs->stack == NULL)
1386 return REG_ESPACE;
1387 }
1388 else
1389 fs = NULL;
1390
1391 cur_node = dfa->init_node;
1392 re_node_set_init_empty (&eps_via_nodes);
1393
1394 if (!regmatch_list_resize (&prev_match, nmatch))
1395 {
1396 regmatch_list_free (&prev_match);
1397 free_fail_stack_return (fs);
1398 return REG_ESPACE;
1399 }
1400 regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1401 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1402
1403 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1404 {
1405 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1406
1407 if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1408 || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1409 {
1410 Idx reg_idx;
1411 cur_node = -1;
1412 if (fs)
1413 {
1414 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1415 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1416 {
1417 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1418 prev_idx_match, &eps_via_nodes);
1419 break;
1420 }
1421 }
1422 if (cur_node < 0)
1423 {
1424 re_node_set_free (&eps_via_nodes);
1425 regmatch_list_free (&prev_match);
1426 return free_fail_stack_return (fs);
1427 }
1428 }
1429
1430 /* Proceed to next node. */
1431 cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1432 &idx, cur_node,
1433 &eps_via_nodes, fs);
1434
1435 if (__glibc_unlikely (cur_node < 0))
1436 {
1437 if (__glibc_unlikely (cur_node == -2))
1438 {
1439 re_node_set_free (&eps_via_nodes);
1440 regmatch_list_free (&prev_match);
1441 free_fail_stack_return (fs);
1442 return REG_ESPACE;
1443 }
1444 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1445 prev_idx_match, &eps_via_nodes);
1446 if (cur_node < 0)
1447 {
1448 re_node_set_free (&eps_via_nodes);
1449 regmatch_list_free (&prev_match);
1450 free_fail_stack_return (fs);
1451 return REG_NOMATCH;
1452 }
1453 }
1454 }
1455 re_node_set_free (&eps_via_nodes);
1456 regmatch_list_free (&prev_match);
1457 return free_fail_stack_return (fs);
1458}
1459
1460static reg_errcode_t
1461free_fail_stack_return (struct re_fail_stack_t *fs)
1462{
1463 if (fs)
1464 {
1465 Idx fs_idx;
1466 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1467 {
1468 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1469 re_free (fs->stack[fs_idx].regs);
1470 }
1471 re_free (fs->stack);
1472 }
1473 return REG_NOERROR;
1474}
1475
1476static void
1477update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1478 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1479{
1480 int type = dfa->nodes[cur_node].type;
1481 if (type == OP_OPEN_SUBEXP)
1482 {
1483 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1484
1485 /* We are at the first node of this sub expression. */
1486 if (reg_num < nmatch)
1487 {
1488 pmatch[reg_num].rm_so = cur_idx;
1489 pmatch[reg_num].rm_eo = -1;
1490 }
1491 }
1492 else if (type == OP_CLOSE_SUBEXP)
1493 {
1494 /* We are at the last node of this sub expression. */
1495 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1496 if (reg_num < nmatch)
1497 {
1498 if (pmatch[reg_num].rm_so < cur_idx)
1499 {
1500 pmatch[reg_num].rm_eo = cur_idx;
1501 /* This is a non-empty match or we are not inside an optional
1502 subexpression. Accept this right away. */
1503 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1504 }
1505 else
1506 {
1507 if (dfa->nodes[cur_node].opt_subexp
1508 && prev_idx_match[reg_num].rm_so != -1)
1509 /* We transited through an empty match for an optional
1510 subexpression, like (a?)*, and this is not the subexp's
1511 first match. Copy back the old content of the registers
1512 so that matches of an inner subexpression are undone as
1513 well, like in ((a?))*. */
1514 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1515 else
1516 /* We completed a subexpression, but it may be part of
1517 an optional one, so do not update PREV_IDX_MATCH. */
1518 pmatch[reg_num].rm_eo = cur_idx;
1519 }
1520 }
1521 }
1522}
1523
1524/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1525 and sift the nodes in each states according to the following rules.
1526 Updated state_log will be wrote to STATE_LOG.
1527
1528 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1529 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1530 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1531 the LAST_NODE, we throw away the node 'a'.
1532 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1533 string 's' and transit to 'b':
1534 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1535 away the node 'a'.
1536 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1537 thrown away, we throw away the node 'a'.
1538 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1539 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1540 node 'a'.
1541 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1542 we throw away the node 'a'. */
1543
1544#define STATE_NODE_CONTAINS(state,node) \
1545 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1546
1547static reg_errcode_t
1548sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1549{
1550 reg_errcode_t err;
1551 int null_cnt = 0;
1552 Idx str_idx = sctx->last_str_idx;
1553 re_node_set cur_dest;
1554
1555 DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1556
1557 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1558 transit to the last_node and the last_node itself. */
1559 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1560 if (__glibc_unlikely (err != REG_NOERROR))
1561 return err;
1562 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1563 if (__glibc_unlikely (err != REG_NOERROR))
1564 goto free_return;
1565
1566 /* Then check each states in the state_log. */
1567 while (str_idx > 0)
1568 {
1569 /* Update counters. */
1570 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1571 if (null_cnt > mctx->max_mb_elem_len)
1572 {
1573 memset (sctx->sifted_states, '\0',
1574 sizeof (re_dfastate_t *) * str_idx);
1575 re_node_set_free (&cur_dest);
1576 return REG_NOERROR;
1577 }
1578 re_node_set_empty (&cur_dest);
1579 --str_idx;
1580
1581 if (mctx->state_log[str_idx])
1582 {
1583 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1584 if (__glibc_unlikely (err != REG_NOERROR))
1585 goto free_return;
1586 }
1587
1588 /* Add all the nodes which satisfy the following conditions:
1589 - It can epsilon transit to a node in CUR_DEST.
1590 - It is in CUR_SRC.
1591 And update state_log. */
1592 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1593 if (__glibc_unlikely (err != REG_NOERROR))
1594 goto free_return;
1595 }
1596 err = REG_NOERROR;
1597 free_return:
1598 re_node_set_free (&cur_dest);
1599 return err;
1600}
1601
1602static reg_errcode_t
1603__attribute_warn_unused_result__
1604build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1605 Idx str_idx, re_node_set *cur_dest)
1606{
1607 const re_dfa_t *const dfa = mctx->dfa;
1608 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1609 Idx i;
1610
1611 /* Then build the next sifted state.
1612 We build the next sifted state on 'cur_dest', and update
1613 'sifted_states[str_idx]' with 'cur_dest'.
1614 Note:
1615 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1616 'cur_src' points the node_set of the old 'state_log[str_idx]'
1617 (with the epsilon nodes pre-filtered out). */
1618 for (i = 0; i < cur_src->nelem; i++)
1619 {
1620 Idx prev_node = cur_src->elems[i];
1621 int naccepted = 0;
1622 bool ok;
1623 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1624
1625 /* If the node may accept "multi byte". */
1626 if (dfa->nodes[prev_node].accept_mb)
1627 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1628 str_idx, sctx->last_str_idx);
1629
1630 /* We don't check backreferences here.
1631 See update_cur_sifted_state(). */
1632 if (!naccepted
1633 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1634 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1635 dfa->nexts[prev_node]))
1636 naccepted = 1;
1637
1638 if (naccepted == 0)
1639 continue;
1640
1641 if (sctx->limits.nelem)
1642 {
1643 Idx to_idx = str_idx + naccepted;
1644 if (check_dst_limits (mctx, &sctx->limits,
1645 dfa->nexts[prev_node], to_idx,
1646 prev_node, str_idx))
1647 continue;
1648 }
1649 ok = re_node_set_insert (cur_dest, prev_node);
1650 if (__glibc_unlikely (! ok))
1651 return REG_ESPACE;
1652 }
1653
1654 return REG_NOERROR;
1655}
1656
1657/* Helper functions. */
1658
1659static reg_errcode_t
1660clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1661{
1662 Idx top = mctx->state_log_top;
1663
1664 if ((next_state_log_idx >= mctx->input.bufs_len
1665 && mctx->input.bufs_len < mctx->input.len)
1666 || (next_state_log_idx >= mctx->input.valid_len
1667 && mctx->input.valid_len < mctx->input.len))
1668 {
1669 reg_errcode_t err;
1670 err = extend_buffers (mctx, next_state_log_idx + 1);
1671 if (__glibc_unlikely (err != REG_NOERROR))
1672 return err;
1673 }
1674
1675 if (top < next_state_log_idx)
1676 {
1677 DEBUG_ASSERT (mctx->state_log != NULL);
1678 memset (mctx->state_log + top + 1, '\0',
1679 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1680 mctx->state_log_top = next_state_log_idx;
1681 }
1682 return REG_NOERROR;
1683}
1684
1685static reg_errcode_t
1686merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1687 re_dfastate_t **src, Idx num)
1688{
1689 Idx st_idx;
1690 reg_errcode_t err;
1691 for (st_idx = 0; st_idx < num; ++st_idx)
1692 {
1693 if (dst[st_idx] == NULL)
1694 dst[st_idx] = src[st_idx];
1695 else if (src[st_idx] != NULL)
1696 {
1697 re_node_set merged_set;
1698 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1699 &src[st_idx]->nodes);
1700 if (__glibc_unlikely (err != REG_NOERROR))
1701 return err;
1702 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1703 re_node_set_free (&merged_set);
1704 if (__glibc_unlikely (err != REG_NOERROR))
1705 return err;
1706 }
1707 }
1708 return REG_NOERROR;
1709}
1710
1711static reg_errcode_t
1712update_cur_sifted_state (const re_match_context_t *mctx,
1713 re_sift_context_t *sctx, Idx str_idx,
1714 re_node_set *dest_nodes)
1715{
1716 const re_dfa_t *const dfa = mctx->dfa;
1717 reg_errcode_t err = REG_NOERROR;
1718 const re_node_set *candidates;
1719 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1720 : &mctx->state_log[str_idx]->nodes);
1721
1722 if (dest_nodes->nelem == 0)
1723 sctx->sifted_states[str_idx] = NULL;
1724 else
1725 {
1726 if (candidates)
1727 {
1728 /* At first, add the nodes which can epsilon transit to a node in
1729 DEST_NODE. */
1730 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1731 if (__glibc_unlikely (err != REG_NOERROR))
1732 return err;
1733
1734 /* Then, check the limitations in the current sift_context. */
1735 if (sctx->limits.nelem)
1736 {
1737 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1738 mctx->bkref_ents, str_idx);
1739 if (__glibc_unlikely (err != REG_NOERROR))
1740 return err;
1741 }
1742 }
1743
1744 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1745 if (__glibc_unlikely (err != REG_NOERROR))
1746 return err;
1747 }
1748
1749 if (candidates && mctx->state_log[str_idx]->has_backref)
1750 {
1751 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1752 if (__glibc_unlikely (err != REG_NOERROR))
1753 return err;
1754 }
1755 return REG_NOERROR;
1756}
1757
1758static reg_errcode_t
1759__attribute_warn_unused_result__
1760add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1761 const re_node_set *candidates)
1762{
1763 reg_errcode_t err = REG_NOERROR;
1764 Idx i;
1765
1766 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1767 if (__glibc_unlikely (err != REG_NOERROR))
1768 return err;
1769
1770 if (!state->inveclosure.alloc)
1771 {
1772 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1773 if (__glibc_unlikely (err != REG_NOERROR))
1774 return REG_ESPACE;
1775 for (i = 0; i < dest_nodes->nelem; i++)
1776 {
1777 err = re_node_set_merge (&state->inveclosure,
1778 dfa->inveclosures + dest_nodes->elems[i]);
1779 if (__glibc_unlikely (err != REG_NOERROR))
1780 return REG_ESPACE;
1781 }
1782 }
1783 return re_node_set_add_intersect (dest_nodes, candidates,
1784 &state->inveclosure);
1785}
1786
1787static reg_errcode_t
1788sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1789 const re_node_set *candidates)
1790{
1791 Idx ecl_idx;
1792 reg_errcode_t err;
1793 re_node_set *inv_eclosure = dfa->inveclosures + node;
1794 re_node_set except_nodes;
1795 re_node_set_init_empty (&except_nodes);
1796 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1797 {
1798 Idx cur_node = inv_eclosure->elems[ecl_idx];
1799 if (cur_node == node)
1800 continue;
1801 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1802 {
1803 Idx edst1 = dfa->edests[cur_node].elems[0];
1804 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1805 ? dfa->edests[cur_node].elems[1] : -1);
1806 if ((!re_node_set_contains (inv_eclosure, edst1)
1807 && re_node_set_contains (dest_nodes, edst1))
1808 || (edst2 > 0
1809 && !re_node_set_contains (inv_eclosure, edst2)
1810 && re_node_set_contains (dest_nodes, edst2)))
1811 {
1812 err = re_node_set_add_intersect (&except_nodes, candidates,
1813 dfa->inveclosures + cur_node);
1814 if (__glibc_unlikely (err != REG_NOERROR))
1815 {
1816 re_node_set_free (&except_nodes);
1817 return err;
1818 }
1819 }
1820 }
1821 }
1822 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823 {
1824 Idx cur_node = inv_eclosure->elems[ecl_idx];
1825 if (!re_node_set_contains (&except_nodes, cur_node))
1826 {
1827 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1828 re_node_set_remove_at (dest_nodes, idx);
1829 }
1830 }
1831 re_node_set_free (&except_nodes);
1832 return REG_NOERROR;
1833}
1834
1835static bool
1836check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1837 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1838{
1839 const re_dfa_t *const dfa = mctx->dfa;
1840 Idx lim_idx, src_pos, dst_pos;
1841
1842 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1843 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1844 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1845 {
1846 Idx subexp_idx;
1847 struct re_backref_cache_entry *ent;
1848 ent = mctx->bkref_ents + limits->elems[lim_idx];
1849 subexp_idx = dfa->nodes[ent->node].opr.idx;
1850
1851 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1852 subexp_idx, dst_node, dst_idx,
1853 dst_bkref_idx);
1854 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1855 subexp_idx, src_node, src_idx,
1856 src_bkref_idx);
1857
1858 /* In case of:
1859 <src> <dst> ( <subexp> )
1860 ( <subexp> ) <src> <dst>
1861 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1862 if (src_pos == dst_pos)
1863 continue; /* This is unrelated limitation. */
1864 else
1865 return true;
1866 }
1867 return false;
1868}
1869
1870static int
1871check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1872 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1873{
1874 const re_dfa_t *const dfa = mctx->dfa;
1875 const re_node_set *eclosures = dfa->eclosures + from_node;
1876 Idx node_idx;
1877
1878 /* Else, we are on the boundary: examine the nodes on the epsilon
1879 closure. */
1880 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1881 {
1882 Idx node = eclosures->elems[node_idx];
1883 switch (dfa->nodes[node].type)
1884 {
1885 case OP_BACK_REF:
1886 if (bkref_idx != -1)
1887 {
1888 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1889 do
1890 {
1891 Idx dst;
1892 int cpos;
1893
1894 if (ent->node != node)
1895 continue;
1896
1897 if (subexp_idx < BITSET_WORD_BITS
1898 && !(ent->eps_reachable_subexps_map
1899 & ((bitset_word_t) 1 << subexp_idx)))
1900 continue;
1901
1902 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1903 OP_CLOSE_SUBEXP cases below. But, if the
1904 destination node is the same node as the source
1905 node, don't recurse because it would cause an
1906 infinite loop: a regex that exhibits this behavior
1907 is ()\1*\1* */
1908 dst = dfa->edests[node].elems[0];
1909 if (dst == from_node)
1910 {
1911 if (boundaries & 1)
1912 return -1;
1913 else /* if (boundaries & 2) */
1914 return 0;
1915 }
1916
1917 cpos =
1918 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1919 dst, bkref_idx);
1920 if (cpos == -1 /* && (boundaries & 1) */)
1921 return -1;
1922 if (cpos == 0 && (boundaries & 2))
1923 return 0;
1924
1925 if (subexp_idx < BITSET_WORD_BITS)
1926 ent->eps_reachable_subexps_map
1927 &= ~((bitset_word_t) 1 << subexp_idx);
1928 }
1929 while (ent++->more);
1930 }
1931 break;
1932
1933 case OP_OPEN_SUBEXP:
1934 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1935 return -1;
1936 break;
1937
1938 case OP_CLOSE_SUBEXP:
1939 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1940 return 0;
1941 break;
1942
1943 default:
1944 break;
1945 }
1946 }
1947
1948 return (boundaries & 2) ? 1 : 0;
1949}
1950
1951static int
1952check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1953 Idx subexp_idx, Idx from_node, Idx str_idx,
1954 Idx bkref_idx)
1955{
1956 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1957 int boundaries;
1958
1959 /* If we are outside the range of the subexpression, return -1 or 1. */
1960 if (str_idx < lim->subexp_from)
1961 return -1;
1962
1963 if (lim->subexp_to < str_idx)
1964 return 1;
1965
1966 /* If we are within the subexpression, return 0. */
1967 boundaries = (str_idx == lim->subexp_from);
1968 boundaries |= (str_idx == lim->subexp_to) << 1;
1969 if (boundaries == 0)
1970 return 0;
1971
1972 /* Else, examine epsilon closure. */
1973 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1974 from_node, bkref_idx);
1975}
1976
1977/* Check the limitations of sub expressions LIMITS, and remove the nodes
1978 which are against limitations from DEST_NODES. */
1979
1980static reg_errcode_t
1981check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1982 const re_node_set *candidates, re_node_set *limits,
1983 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1984{
1985 reg_errcode_t err;
1986 Idx node_idx, lim_idx;
1987
1988 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1989 {
1990 Idx subexp_idx;
1991 struct re_backref_cache_entry *ent;
1992 ent = bkref_ents + limits->elems[lim_idx];
1993
1994 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1995 continue; /* This is unrelated limitation. */
1996
1997 subexp_idx = dfa->nodes[ent->node].opr.idx;
1998 if (ent->subexp_to == str_idx)
1999 {
2000 Idx ops_node = -1;
2001 Idx cls_node = -1;
2002 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2003 {
2004 Idx node = dest_nodes->elems[node_idx];
2005 re_token_type_t type = dfa->nodes[node].type;
2006 if (type == OP_OPEN_SUBEXP
2007 && subexp_idx == dfa->nodes[node].opr.idx)
2008 ops_node = node;
2009 else if (type == OP_CLOSE_SUBEXP
2010 && subexp_idx == dfa->nodes[node].opr.idx)
2011 cls_node = node;
2012 }
2013
2014 /* Check the limitation of the open subexpression. */
2015 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2016 if (ops_node >= 0)
2017 {
2018 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2019 candidates);
2020 if (__glibc_unlikely (err != REG_NOERROR))
2021 return err;
2022 }
2023
2024 /* Check the limitation of the close subexpression. */
2025 if (cls_node >= 0)
2026 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2027 {
2028 Idx node = dest_nodes->elems[node_idx];
2029 if (!re_node_set_contains (dfa->inveclosures + node,
2030 cls_node)
2031 && !re_node_set_contains (dfa->eclosures + node,
2032 cls_node))
2033 {
2034 /* It is against this limitation.
2035 Remove it form the current sifted state. */
2036 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2037 candidates);
2038 if (__glibc_unlikely (err != REG_NOERROR))
2039 return err;
2040 --node_idx;
2041 }
2042 }
2043 }
2044 else /* (ent->subexp_to != str_idx) */
2045 {
2046 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2047 {
2048 Idx node = dest_nodes->elems[node_idx];
2049 re_token_type_t type = dfa->nodes[node].type;
2050 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2051 {
2052 if (subexp_idx != dfa->nodes[node].opr.idx)
2053 continue;
2054 /* It is against this limitation.
2055 Remove it form the current sifted state. */
2056 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2057 candidates);
2058 if (__glibc_unlikely (err != REG_NOERROR))
2059 return err;
2060 }
2061 }
2062 }
2063 }
2064 return REG_NOERROR;
2065}
2066
2067static reg_errcode_t
2068__attribute_warn_unused_result__
2069sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2070 Idx str_idx, const re_node_set *candidates)
2071{
2072 const re_dfa_t *const dfa = mctx->dfa;
2073 reg_errcode_t err;
2074 Idx node_idx, node;
2075 re_sift_context_t local_sctx;
2076 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2077
2078 if (first_idx == -1)
2079 return REG_NOERROR;
2080
2081 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2082
2083 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2084 {
2085 Idx enabled_idx;
2086 re_token_type_t type;
2087 struct re_backref_cache_entry *entry;
2088 node = candidates->elems[node_idx];
2089 type = dfa->nodes[node].type;
2090 /* Avoid infinite loop for the REs like "()\1+". */
2091 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2092 continue;
2093 if (type != OP_BACK_REF)
2094 continue;
2095
2096 entry = mctx->bkref_ents + first_idx;
2097 enabled_idx = first_idx;
2098 do
2099 {
2100 Idx subexp_len;
2101 Idx to_idx;
2102 Idx dst_node;
2103 bool ok;
2104 re_dfastate_t *cur_state;
2105
2106 if (entry->node != node)
2107 continue;
2108 subexp_len = entry->subexp_to - entry->subexp_from;
2109 to_idx = str_idx + subexp_len;
2110 dst_node = (subexp_len ? dfa->nexts[node]
2111 : dfa->edests[node].elems[0]);
2112
2113 if (to_idx > sctx->last_str_idx
2114 || sctx->sifted_states[to_idx] == NULL
2115 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2116 || check_dst_limits (mctx, &sctx->limits, node,
2117 str_idx, dst_node, to_idx))
2118 continue;
2119
2120 if (local_sctx.sifted_states == NULL)
2121 {
2122 local_sctx = *sctx;
2123 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2124 if (__glibc_unlikely (err != REG_NOERROR))
2125 goto free_return;
2126 }
2127 local_sctx.last_node = node;
2128 local_sctx.last_str_idx = str_idx;
2129 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2130 if (__glibc_unlikely (! ok))
2131 {
2132 err = REG_ESPACE;
2133 goto free_return;
2134 }
2135 cur_state = local_sctx.sifted_states[str_idx];
2136 err = sift_states_backward (mctx, &local_sctx);
2137 if (__glibc_unlikely (err != REG_NOERROR))
2138 goto free_return;
2139 if (sctx->limited_states != NULL)
2140 {
2141 err = merge_state_array (dfa, sctx->limited_states,
2142 local_sctx.sifted_states,
2143 str_idx + 1);
2144 if (__glibc_unlikely (err != REG_NOERROR))
2145 goto free_return;
2146 }
2147 local_sctx.sifted_states[str_idx] = cur_state;
2148 re_node_set_remove (&local_sctx.limits, enabled_idx);
2149
2150 /* mctx->bkref_ents may have changed, reload the pointer. */
2151 entry = mctx->bkref_ents + enabled_idx;
2152 }
2153 while (enabled_idx++, entry++->more);
2154 }
2155 err = REG_NOERROR;
2156 free_return:
2157 if (local_sctx.sifted_states != NULL)
2158 {
2159 re_node_set_free (&local_sctx.limits);
2160 }
2161
2162 return err;
2163}
2164
2165
2166static int
2167sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2168 Idx node_idx, Idx str_idx, Idx max_str_idx)
2169{
2170 const re_dfa_t *const dfa = mctx->dfa;
2171 int naccepted;
2172 /* Check the node can accept "multi byte". */
2173 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2174 if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2175 && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2176 dfa->nexts[node_idx]))
2177 /* The node can't accept the "multi byte", or the
2178 destination was already thrown away, then the node
2179 couldn't accept the current input "multi byte". */
2180 naccepted = 0;
2181 /* Otherwise, it is sure that the node could accept
2182 'naccepted' bytes input. */
2183 return naccepted;
2184}
2185
2186
2187/* Functions for state transition. */
2188
2189/* Return the next state to which the current state STATE will transit by
2190 accepting the current input byte, and update STATE_LOG if necessary.
2191 Return NULL on failure.
2192 If STATE can accept a multibyte char/collating element/back reference
2193 update the destination of STATE_LOG. */
2194
2195static re_dfastate_t *
2196__attribute_warn_unused_result__
2197transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2198 re_dfastate_t *state)
2199{
2200 re_dfastate_t **trtable;
2201 unsigned char ch;
2202
2203 /* If the current state can accept multibyte. */
2204 if (__glibc_unlikely (state->accept_mb))
2205 {
2206 *err = transit_state_mb (mctx, state);
2207 if (__glibc_unlikely (*err != REG_NOERROR))
2208 return NULL;
2209 }
2210
2211 /* Then decide the next state with the single byte. */
2212#if 0
2213 if (0)
2214 /* don't use transition table */
2215 return transit_state_sb (err, mctx, state);
2216#endif
2217
2218 /* Use transition table */
2219 ch = re_string_fetch_byte (&mctx->input);
2220 for (;;)
2221 {
2222 trtable = state->trtable;
2223 if (__glibc_likely (trtable != NULL))
2224 return trtable[ch];
2225
2226 trtable = state->word_trtable;
2227 if (__glibc_likely (trtable != NULL))
2228 {
2229 unsigned int context;
2230 context
2231 = re_string_context_at (&mctx->input,
2232 re_string_cur_idx (&mctx->input) - 1,
2233 mctx->eflags);
2234 if (IS_WORD_CONTEXT (context))
2235 return trtable[ch + SBC_MAX];
2236 else
2237 return trtable[ch];
2238 }
2239
2240 if (!build_trtable (mctx->dfa, state))
2241 {
2242 *err = REG_ESPACE;
2243 return NULL;
2244 }
2245
2246 /* Retry, we now have a transition table. */
2247 }
2248}
2249
2250/* Update the state_log if we need */
2251static re_dfastate_t *
2252merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2253 re_dfastate_t *next_state)
2254{
2255 const re_dfa_t *const dfa = mctx->dfa;
2256 Idx cur_idx = re_string_cur_idx (&mctx->input);
2257
2258 if (cur_idx > mctx->state_log_top)
2259 {
2260 mctx->state_log[cur_idx] = next_state;
2261 mctx->state_log_top = cur_idx;
2262 }
2263 else if (mctx->state_log[cur_idx] == 0)
2264 {
2265 mctx->state_log[cur_idx] = next_state;
2266 }
2267 else
2268 {
2269 re_dfastate_t *pstate;
2270 unsigned int context;
2271 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2272 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2273 the destination of a multibyte char/collating element/
2274 back reference. Then the next state is the union set of
2275 these destinations and the results of the transition table. */
2276 pstate = mctx->state_log[cur_idx];
2277 log_nodes = pstate->entrance_nodes;
2278 if (next_state != NULL)
2279 {
2280 table_nodes = next_state->entrance_nodes;
2281 *err = re_node_set_init_union (&next_nodes, table_nodes,
2282 log_nodes);
2283 if (__glibc_unlikely (*err != REG_NOERROR))
2284 return NULL;
2285 }
2286 else
2287 next_nodes = *log_nodes;
2288 /* Note: We already add the nodes of the initial state,
2289 then we don't need to add them here. */
2290
2291 context = re_string_context_at (&mctx->input,
2292 re_string_cur_idx (&mctx->input) - 1,
2293 mctx->eflags);
2294 next_state = mctx->state_log[cur_idx]
2295 = re_acquire_state_context (err, dfa, &next_nodes, context);
2296 /* We don't need to check errors here, since the return value of
2297 this function is next_state and ERR is already set. */
2298
2299 if (table_nodes != NULL)
2300 re_node_set_free (&next_nodes);
2301 }
2302
2303 if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2304 {
2305 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2306 later. We must check them here, since the back references in the
2307 next state might use them. */
2308 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2309 cur_idx);
2310 if (__glibc_unlikely (*err != REG_NOERROR))
2311 return NULL;
2312
2313 /* If the next state has back references. */
2314 if (next_state->has_backref)
2315 {
2316 *err = transit_state_bkref (mctx, &next_state->nodes);
2317 if (__glibc_unlikely (*err != REG_NOERROR))
2318 return NULL;
2319 next_state = mctx->state_log[cur_idx];
2320 }
2321 }
2322
2323 return next_state;
2324}
2325
2326/* Skip bytes in the input that correspond to part of a
2327 multi-byte match, then look in the log for a state
2328 from which to restart matching. */
2329static re_dfastate_t *
2330find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2331{
2332 re_dfastate_t *cur_state;
2333 do
2334 {
2335 Idx max = mctx->state_log_top;
2336 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2337
2338 do
2339 {
2340 if (++cur_str_idx > max)
2341 return NULL;
2342 re_string_skip_bytes (&mctx->input, 1);
2343 }
2344 while (mctx->state_log[cur_str_idx] == NULL);
2345
2346 cur_state = merge_state_with_log (err, mctx, NULL);
2347 }
2348 while (*err == REG_NOERROR && cur_state == NULL);
2349 return cur_state;
2350}
2351
2352/* Helper functions for transit_state. */
2353
2354/* From the node set CUR_NODES, pick up the nodes whose types are
2355 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2356 expression. And register them to use them later for evaluating the
2357 corresponding back references. */
2358
2359static reg_errcode_t
2360check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2361 Idx str_idx)
2362{
2363 const re_dfa_t *const dfa = mctx->dfa;
2364 Idx node_idx;
2365 reg_errcode_t err;
2366
2367 /* TODO: This isn't efficient.
2368 Because there might be more than one nodes whose types are
2369 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2370 nodes.
2371 E.g. RE: (a){2} */
2372 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2373 {
2374 Idx node = cur_nodes->elems[node_idx];
2375 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2376 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2377 && (dfa->used_bkref_map
2378 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2379 {
2380 err = match_ctx_add_subtop (mctx, node, str_idx);
2381 if (__glibc_unlikely (err != REG_NOERROR))
2382 return err;
2383 }
2384 }
2385 return REG_NOERROR;
2386}
2387
2388#if 0
2389/* Return the next state to which the current state STATE will transit by
2390 accepting the current input byte. Return NULL on failure. */
2391
2392static re_dfastate_t *
2393transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2394 re_dfastate_t *state)
2395{
2396 const re_dfa_t *const dfa = mctx->dfa;
2397 re_node_set next_nodes;
2398 re_dfastate_t *next_state;
2399 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2400 unsigned int context;
2401
2402 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2403 if (__glibc_unlikely (*err != REG_NOERROR))
2404 return NULL;
2405 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2406 {
2407 Idx cur_node = state->nodes.elems[node_cnt];
2408 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2409 {
2410 *err = re_node_set_merge (&next_nodes,
2411 dfa->eclosures + dfa->nexts[cur_node]);
2412 if (__glibc_unlikely (*err != REG_NOERROR))
2413 {
2414 re_node_set_free (&next_nodes);
2415 return NULL;
2416 }
2417 }
2418 }
2419 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2420 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2421 /* We don't need to check errors here, since the return value of
2422 this function is next_state and ERR is already set. */
2423
2424 re_node_set_free (&next_nodes);
2425 re_string_skip_bytes (&mctx->input, 1);
2426 return next_state;
2427}
2428#endif
2429
2430static reg_errcode_t
2431transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2432{
2433 const re_dfa_t *const dfa = mctx->dfa;
2434 reg_errcode_t err;
2435 Idx i;
2436
2437 for (i = 0; i < pstate->nodes.nelem; ++i)
2438 {
2439 re_node_set dest_nodes, *new_nodes;
2440 Idx cur_node_idx = pstate->nodes.elems[i];
2441 int naccepted;
2442 Idx dest_idx;
2443 unsigned int context;
2444 re_dfastate_t *dest_state;
2445
2446 if (!dfa->nodes[cur_node_idx].accept_mb)
2447 continue;
2448
2449 if (dfa->nodes[cur_node_idx].constraint)
2450 {
2451 context = re_string_context_at (&mctx->input,
2452 re_string_cur_idx (&mctx->input),
2453 mctx->eflags);
2454 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2455 context))
2456 continue;
2457 }
2458
2459 /* How many bytes the node can accept? */
2460 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2461 re_string_cur_idx (&mctx->input));
2462 if (naccepted == 0)
2463 continue;
2464
2465 /* The node can accepts 'naccepted' bytes. */
2466 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2467 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2468 : mctx->max_mb_elem_len);
2469 err = clean_state_log_if_needed (mctx, dest_idx);
2470 if (__glibc_unlikely (err != REG_NOERROR))
2471 return err;
2472 DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2473 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2474
2475 dest_state = mctx->state_log[dest_idx];
2476 if (dest_state == NULL)
2477 dest_nodes = *new_nodes;
2478 else
2479 {
2480 err = re_node_set_init_union (&dest_nodes,
2481 dest_state->entrance_nodes, new_nodes);
2482 if (__glibc_unlikely (err != REG_NOERROR))
2483 return err;
2484 }
2485 context = re_string_context_at (&mctx->input, dest_idx - 1,
2486 mctx->eflags);
2487 mctx->state_log[dest_idx]
2488 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2489 if (dest_state != NULL)
2490 re_node_set_free (&dest_nodes);
2491 if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2492 && err != REG_NOERROR))
2493 return err;
2494 }
2495 return REG_NOERROR;
2496}
2497
2498static reg_errcode_t
2499transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2500{
2501 const re_dfa_t *const dfa = mctx->dfa;
2502 reg_errcode_t err;
2503 Idx i;
2504 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2505
2506 for (i = 0; i < nodes->nelem; ++i)
2507 {
2508 Idx dest_str_idx, prev_nelem, bkc_idx;
2509 Idx node_idx = nodes->elems[i];
2510 unsigned int context;
2511 const re_token_t *node = dfa->nodes + node_idx;
2512 re_node_set *new_dest_nodes;
2513
2514 /* Check whether 'node' is a backreference or not. */
2515 if (node->type != OP_BACK_REF)
2516 continue;
2517
2518 if (node->constraint)
2519 {
2520 context = re_string_context_at (&mctx->input, cur_str_idx,
2521 mctx->eflags);
2522 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2523 continue;
2524 }
2525
2526 /* 'node' is a backreference.
2527 Check the substring which the substring matched. */
2528 bkc_idx = mctx->nbkref_ents;
2529 err = get_subexp (mctx, node_idx, cur_str_idx);
2530 if (__glibc_unlikely (err != REG_NOERROR))
2531 goto free_return;
2532
2533 /* And add the epsilon closures (which is 'new_dest_nodes') of
2534 the backreference to appropriate state_log. */
2535 DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2536 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2537 {
2538 Idx subexp_len;
2539 re_dfastate_t *dest_state;
2540 struct re_backref_cache_entry *bkref_ent;
2541 bkref_ent = mctx->bkref_ents + bkc_idx;
2542 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2543 continue;
2544 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2545 new_dest_nodes = (subexp_len == 0
2546 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2547 : dfa->eclosures + dfa->nexts[node_idx]);
2548 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2549 - bkref_ent->subexp_from);
2550 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2551 mctx->eflags);
2552 dest_state = mctx->state_log[dest_str_idx];
2553 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2554 : mctx->state_log[cur_str_idx]->nodes.nelem);
2555 /* Add 'new_dest_node' to state_log. */
2556 if (dest_state == NULL)
2557 {
2558 mctx->state_log[dest_str_idx]
2559 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2560 context);
2561 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2562 && err != REG_NOERROR))
2563 goto free_return;
2564 }
2565 else
2566 {
2567 re_node_set dest_nodes;
2568 err = re_node_set_init_union (&dest_nodes,
2569 dest_state->entrance_nodes,
2570 new_dest_nodes);
2571 if (__glibc_unlikely (err != REG_NOERROR))
2572 {
2573 re_node_set_free (&dest_nodes);
2574 goto free_return;
2575 }
2576 mctx->state_log[dest_str_idx]
2577 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2578 re_node_set_free (&dest_nodes);
2579 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2580 && err != REG_NOERROR))
2581 goto free_return;
2582 }
2583 /* We need to check recursively if the backreference can epsilon
2584 transit. */
2585 if (subexp_len == 0
2586 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2587 {
2588 err = check_subexp_matching_top (mctx, new_dest_nodes,
2589 cur_str_idx);
2590 if (__glibc_unlikely (err != REG_NOERROR))
2591 goto free_return;
2592 err = transit_state_bkref (mctx, new_dest_nodes);
2593 if (__glibc_unlikely (err != REG_NOERROR))
2594 goto free_return;
2595 }
2596 }
2597 }
2598 err = REG_NOERROR;
2599 free_return:
2600 return err;
2601}
2602
2603/* Enumerate all the candidates which the backreference BKREF_NODE can match
2604 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2605 Note that we might collect inappropriate candidates here.
2606 However, the cost of checking them strictly here is too high, then we
2607 delay these checking for prune_impossible_nodes(). */
2608
2609static reg_errcode_t
2610__attribute_warn_unused_result__
2611get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2612{
2613 const re_dfa_t *const dfa = mctx->dfa;
2614 Idx subexp_num, sub_top_idx;
2615 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2616 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2617 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2618 if (cache_idx != -1)
2619 {
2620 const struct re_backref_cache_entry *entry
2621 = mctx->bkref_ents + cache_idx;
2622 do
2623 if (entry->node == bkref_node)
2624 return REG_NOERROR; /* We already checked it. */
2625 while (entry++->more);
2626 }
2627
2628 subexp_num = dfa->nodes[bkref_node].opr.idx;
2629
2630 /* For each sub expression */
2631 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2632 {
2633 reg_errcode_t err;
2634 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2635 re_sub_match_last_t *sub_last;
2636 Idx sub_last_idx, sl_str, bkref_str_off;
2637
2638 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2639 continue; /* It isn't related. */
2640
2641 sl_str = sub_top->str_idx;
2642 bkref_str_off = bkref_str_idx;
2643 /* At first, check the last node of sub expressions we already
2644 evaluated. */
2645 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2646 {
2647 regoff_t sl_str_diff;
2648 sub_last = sub_top->lasts[sub_last_idx];
2649 sl_str_diff = sub_last->str_idx - sl_str;
2650 /* The matched string by the sub expression match with the substring
2651 at the back reference? */
2652 if (sl_str_diff > 0)
2653 {
2654 if (__glibc_unlikely (bkref_str_off + sl_str_diff
2655 > mctx->input.valid_len))
2656 {
2657 /* Not enough chars for a successful match. */
2658 if (bkref_str_off + sl_str_diff > mctx->input.len)
2659 break;
2660
2661 err = clean_state_log_if_needed (mctx,
2662 bkref_str_off
2663 + sl_str_diff);
2664 if (__glibc_unlikely (err != REG_NOERROR))
2665 return err;
2666 buf = (const char *) re_string_get_buffer (&mctx->input);
2667 }
2668 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2669 /* We don't need to search this sub expression any more. */
2670 break;
2671 }
2672 bkref_str_off += sl_str_diff;
2673 sl_str += sl_str_diff;
2674 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2675 bkref_str_idx);
2676
2677 /* Reload buf, since the preceding call might have reallocated
2678 the buffer. */
2679 buf = (const char *) re_string_get_buffer (&mctx->input);
2680
2681 if (err == REG_NOMATCH)
2682 continue;
2683 if (__glibc_unlikely (err != REG_NOERROR))
2684 return err;
2685 }
2686
2687 if (sub_last_idx < sub_top->nlasts)
2688 continue;
2689 if (sub_last_idx > 0)
2690 ++sl_str;
2691 /* Then, search for the other last nodes of the sub expression. */
2692 for (; sl_str <= bkref_str_idx; ++sl_str)
2693 {
2694 Idx cls_node;
2695 regoff_t sl_str_off;
2696 const re_node_set *nodes;
2697 sl_str_off = sl_str - sub_top->str_idx;
2698 /* The matched string by the sub expression match with the substring
2699 at the back reference? */
2700 if (sl_str_off > 0)
2701 {
2702 if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2703 {
2704 /* If we are at the end of the input, we cannot match. */
2705 if (bkref_str_off >= mctx->input.len)
2706 break;
2707
2708 err = extend_buffers (mctx, bkref_str_off + 1);
2709 if (__glibc_unlikely (err != REG_NOERROR))
2710 return err;
2711
2712 buf = (const char *) re_string_get_buffer (&mctx->input);
2713 }
2714 if (buf [bkref_str_off++] != buf[sl_str - 1])
2715 break; /* We don't need to search this sub expression
2716 any more. */
2717 }
2718 if (mctx->state_log[sl_str] == NULL)
2719 continue;
2720 /* Does this state have a ')' of the sub expression? */
2721 nodes = &mctx->state_log[sl_str]->nodes;
2722 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2723 OP_CLOSE_SUBEXP);
2724 if (cls_node == -1)
2725 continue; /* No. */
2726 if (sub_top->path == NULL)
2727 {
2728 sub_top->path = calloc (sizeof (state_array_t),
2729 sl_str - sub_top->str_idx + 1);
2730 if (sub_top->path == NULL)
2731 return REG_ESPACE;
2732 }
2733 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2734 in the current context? */
2735 err = check_arrival (mctx, sub_top->path, sub_top->node,
2736 sub_top->str_idx, cls_node, sl_str,
2737 OP_CLOSE_SUBEXP);
2738 if (err == REG_NOMATCH)
2739 continue;
2740 if (__glibc_unlikely (err != REG_NOERROR))
2741 return err;
2742 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2743 if (__glibc_unlikely (sub_last == NULL))
2744 return REG_ESPACE;
2745 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2746 bkref_str_idx);
2747 buf = (const char *) re_string_get_buffer (&mctx->input);
2748 if (err == REG_NOMATCH)
2749 continue;
2750 if (__glibc_unlikely (err != REG_NOERROR))
2751 return err;
2752 }
2753 }
2754 return REG_NOERROR;
2755}
2756
2757/* Helper functions for get_subexp(). */
2758
2759/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2760 If it can arrive, register the sub expression expressed with SUB_TOP
2761 and SUB_LAST. */
2762
2763static reg_errcode_t
2764get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2765 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2766{
2767 reg_errcode_t err;
2768 Idx to_idx;
2769 /* Can the subexpression arrive the back reference? */
2770 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2771 sub_last->str_idx, bkref_node, bkref_str,
2772 OP_OPEN_SUBEXP);
2773 if (err != REG_NOERROR)
2774 return err;
2775 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2776 sub_last->str_idx);
2777 if (__glibc_unlikely (err != REG_NOERROR))
2778 return err;
2779 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2780 return clean_state_log_if_needed (mctx, to_idx);
2781}
2782
2783/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2784 Search '(' if FL_OPEN, or search ')' otherwise.
2785 TODO: This function isn't efficient...
2786 Because there might be more than one nodes whose types are
2787 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2788 nodes.
2789 E.g. RE: (a){2} */
2790
2791static Idx
2792find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2793 Idx subexp_idx, int type)
2794{
2795 Idx cls_idx;
2796 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2797 {
2798 Idx cls_node = nodes->elems[cls_idx];
2799 const re_token_t *node = dfa->nodes + cls_node;
2800 if (node->type == type
2801 && node->opr.idx == subexp_idx)
2802 return cls_node;
2803 }
2804 return -1;
2805}
2806
2807/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2808 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2809 heavily reused.
2810 Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2811 REG_ESPACE if memory is exhausted. */
2812
2813static reg_errcode_t
2814__attribute_warn_unused_result__
2815check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2816 Idx top_str, Idx last_node, Idx last_str, int type)
2817{
2818 const re_dfa_t *const dfa = mctx->dfa;
2819 reg_errcode_t err = REG_NOERROR;
2820 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2821 re_dfastate_t *cur_state = NULL;
2822 re_node_set *cur_nodes, next_nodes;
2823 re_dfastate_t **backup_state_log;
2824 unsigned int context;
2825
2826 subexp_num = dfa->nodes[top_node].opr.idx;
2827 /* Extend the buffer if we need. */
2828 if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2829 {
2830 re_dfastate_t **new_array;
2831 Idx old_alloc = path->alloc;
2832 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2833 Idx new_alloc;
2834 if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2835 return REG_ESPACE;
2836 new_alloc = old_alloc + incr_alloc;
2837 if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2838 return REG_ESPACE;
2839 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2840 if (__glibc_unlikely (new_array == NULL))
2841 return REG_ESPACE;
2842 path->array = new_array;
2843 path->alloc = new_alloc;
2844 memset (new_array + old_alloc, '\0',
2845 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2846 }
2847
2848 str_idx = path->next_idx ? path->next_idx : top_str;
2849
2850 /* Temporary modify MCTX. */
2851 backup_state_log = mctx->state_log;
2852 backup_cur_idx = mctx->input.cur_idx;
2853 mctx->state_log = path->array;
2854 mctx->input.cur_idx = str_idx;
2855
2856 /* Setup initial node set. */
2857 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2858 if (str_idx == top_str)
2859 {
2860 err = re_node_set_init_1 (&next_nodes, top_node);
2861 if (__glibc_unlikely (err != REG_NOERROR))
2862 return err;
2863 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2864 if (__glibc_unlikely (err != REG_NOERROR))
2865 {
2866 re_node_set_free (&next_nodes);
2867 return err;
2868 }
2869 }
2870 else
2871 {
2872 cur_state = mctx->state_log[str_idx];
2873 if (cur_state && cur_state->has_backref)
2874 {
2875 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2876 if (__glibc_unlikely (err != REG_NOERROR))
2877 return err;
2878 }
2879 else
2880 re_node_set_init_empty (&next_nodes);
2881 }
2882 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2883 {
2884 if (next_nodes.nelem)
2885 {
2886 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2887 subexp_num, type);
2888 if (__glibc_unlikely (err != REG_NOERROR))
2889 {
2890 re_node_set_free (&next_nodes);
2891 return err;
2892 }
2893 }
2894 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2895 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2896 {
2897 re_node_set_free (&next_nodes);
2898 return err;
2899 }
2900 mctx->state_log[str_idx] = cur_state;
2901 }
2902
2903 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2904 {
2905 re_node_set_empty (&next_nodes);
2906 if (mctx->state_log[str_idx + 1])
2907 {
2908 err = re_node_set_merge (&next_nodes,
2909 &mctx->state_log[str_idx + 1]->nodes);
2910 if (__glibc_unlikely (err != REG_NOERROR))
2911 {
2912 re_node_set_free (&next_nodes);
2913 return err;
2914 }
2915 }
2916 if (cur_state)
2917 {
2918 err = check_arrival_add_next_nodes (mctx, str_idx,
2919 &cur_state->non_eps_nodes,
2920 &next_nodes);
2921 if (__glibc_unlikely (err != REG_NOERROR))
2922 {
2923 re_node_set_free (&next_nodes);
2924 return err;
2925 }
2926 }
2927 ++str_idx;
2928 if (next_nodes.nelem)
2929 {
2930 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2931 if (__glibc_unlikely (err != REG_NOERROR))
2932 {
2933 re_node_set_free (&next_nodes);
2934 return err;
2935 }
2936 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2937 subexp_num, type);
2938 if (__glibc_unlikely (err != REG_NOERROR))
2939 {
2940 re_node_set_free (&next_nodes);
2941 return err;
2942 }
2943 }
2944 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2945 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2946 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2947 {
2948 re_node_set_free (&next_nodes);
2949 return err;
2950 }
2951 mctx->state_log[str_idx] = cur_state;
2952 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2953 }
2954 re_node_set_free (&next_nodes);
2955 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2956 : &mctx->state_log[last_str]->nodes);
2957 path->next_idx = str_idx;
2958
2959 /* Fix MCTX. */
2960 mctx->state_log = backup_state_log;
2961 mctx->input.cur_idx = backup_cur_idx;
2962
2963 /* Then check the current node set has the node LAST_NODE. */
2964 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2965 return REG_NOERROR;
2966
2967 return REG_NOMATCH;
2968}
2969
2970/* Helper functions for check_arrival. */
2971
2972/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2973 to NEXT_NODES.
2974 TODO: This function is similar to the functions transit_state*(),
2975 however this function has many additional works.
2976 Can't we unify them? */
2977
2978static reg_errcode_t
2979__attribute_warn_unused_result__
2980check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2981 re_node_set *cur_nodes, re_node_set *next_nodes)
2982{
2983 const re_dfa_t *const dfa = mctx->dfa;
2984 bool ok;
2985 Idx cur_idx;
2986 reg_errcode_t err = REG_NOERROR;
2987 re_node_set union_set;
2988 re_node_set_init_empty (&union_set);
2989 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2990 {
2991 int naccepted = 0;
2992 Idx cur_node = cur_nodes->elems[cur_idx];
2993 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
2994
2995 /* If the node may accept "multi byte". */
2996 if (dfa->nodes[cur_node].accept_mb)
2997 {
2998 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2999 str_idx);
3000 if (naccepted > 1)
3001 {
3002 re_dfastate_t *dest_state;
3003 Idx next_node = dfa->nexts[cur_node];
3004 Idx next_idx = str_idx + naccepted;
3005 dest_state = mctx->state_log[next_idx];
3006 re_node_set_empty (&union_set);
3007 if (dest_state)
3008 {
3009 err = re_node_set_merge (&union_set, &dest_state->nodes);
3010 if (__glibc_unlikely (err != REG_NOERROR))
3011 {
3012 re_node_set_free (&union_set);
3013 return err;
3014 }
3015 }
3016 ok = re_node_set_insert (&union_set, next_node);
3017 if (__glibc_unlikely (! ok))
3018 {
3019 re_node_set_free (&union_set);
3020 return REG_ESPACE;
3021 }
3022 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3023 &union_set);
3024 if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3025 && err != REG_NOERROR))
3026 {
3027 re_node_set_free (&union_set);
3028 return err;
3029 }
3030 }
3031 }
3032
3033 if (naccepted
3034 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3035 {
3036 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3037 if (__glibc_unlikely (! ok))
3038 {
3039 re_node_set_free (&union_set);
3040 return REG_ESPACE;
3041 }
3042 }
3043 }
3044 re_node_set_free (&union_set);
3045 return REG_NOERROR;
3046}
3047
3048/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3049 CUR_NODES, however exclude the nodes which are:
3050 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3051 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3052*/
3053
3054static reg_errcode_t
3055check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3056 Idx ex_subexp, int type)
3057{
3058 reg_errcode_t err;
3059 Idx idx, outside_node;
3060 re_node_set new_nodes;
3061 DEBUG_ASSERT (cur_nodes->nelem);
3062 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3063 if (__glibc_unlikely (err != REG_NOERROR))
3064 return err;
3065 /* Create a new node set NEW_NODES with the nodes which are epsilon
3066 closures of the node in CUR_NODES. */
3067
3068 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3069 {
3070 Idx cur_node = cur_nodes->elems[idx];
3071 const re_node_set *eclosure = dfa->eclosures + cur_node;
3072 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3073 if (outside_node == -1)
3074 {
3075 /* There are no problematic nodes, just merge them. */
3076 err = re_node_set_merge (&new_nodes, eclosure);
3077 if (__glibc_unlikely (err != REG_NOERROR))
3078 {
3079 re_node_set_free (&new_nodes);
3080 return err;
3081 }
3082 }
3083 else
3084 {
3085 /* There are problematic nodes, re-calculate incrementally. */
3086 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3087 ex_subexp, type);
3088 if (__glibc_unlikely (err != REG_NOERROR))
3089 {
3090 re_node_set_free (&new_nodes);
3091 return err;
3092 }
3093 }
3094 }
3095 re_node_set_free (cur_nodes);
3096 *cur_nodes = new_nodes;
3097 return REG_NOERROR;
3098}
3099
3100/* Helper function for check_arrival_expand_ecl.
3101 Check incrementally the epsilon closure of TARGET, and if it isn't
3102 problematic append it to DST_NODES. */
3103
3104static reg_errcode_t
3105__attribute_warn_unused_result__
3106check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3107 Idx target, Idx ex_subexp, int type)
3108{
3109 Idx cur_node;
3110 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3111 {
3112 bool ok;
3113
3114 if (dfa->nodes[cur_node].type == type
3115 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3116 {
3117 if (type == OP_CLOSE_SUBEXP)
3118 {
3119 ok = re_node_set_insert (dst_nodes, cur_node);
3120 if (__glibc_unlikely (! ok))
3121 return REG_ESPACE;
3122 }
3123 break;
3124 }
3125 ok = re_node_set_insert (dst_nodes, cur_node);
3126 if (__glibc_unlikely (! ok))
3127 return REG_ESPACE;
3128 if (dfa->edests[cur_node].nelem == 0)
3129 break;
3130 if (dfa->edests[cur_node].nelem == 2)
3131 {
3132 reg_errcode_t err;
3133 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3134 dfa->edests[cur_node].elems[1],
3135 ex_subexp, type);
3136 if (__glibc_unlikely (err != REG_NOERROR))
3137 return err;
3138 }
3139 cur_node = dfa->edests[cur_node].elems[0];
3140 }
3141 return REG_NOERROR;
3142}
3143
3144
3145/* For all the back references in the current state, calculate the
3146 destination of the back references by the appropriate entry
3147 in MCTX->BKREF_ENTS. */
3148
3149static reg_errcode_t
3150__attribute_warn_unused_result__
3151expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3152 Idx cur_str, Idx subexp_num, int type)
3153{
3154 const re_dfa_t *const dfa = mctx->dfa;
3155 reg_errcode_t err;
3156 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3157 struct re_backref_cache_entry *ent;
3158
3159 if (cache_idx_start == -1)
3160 return REG_NOERROR;
3161
3162 restart:
3163 ent = mctx->bkref_ents + cache_idx_start;
3164 do
3165 {
3166 Idx to_idx, next_node;
3167
3168 /* Is this entry ENT is appropriate? */
3169 if (!re_node_set_contains (cur_nodes, ent->node))
3170 continue; /* No. */
3171
3172 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3173 /* Calculate the destination of the back reference, and append it
3174 to MCTX->STATE_LOG. */
3175 if (to_idx == cur_str)
3176 {
3177 /* The backreference did epsilon transit, we must re-check all the
3178 node in the current state. */
3179 re_node_set new_dests;
3180 reg_errcode_t err2, err3;
3181 next_node = dfa->edests[ent->node].elems[0];
3182 if (re_node_set_contains (cur_nodes, next_node))
3183 continue;
3184 err = re_node_set_init_1 (&new_dests, next_node);
3185 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3186 err3 = re_node_set_merge (cur_nodes, &new_dests);
3187 re_node_set_free (&new_dests);
3188 if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3189 || err3 != REG_NOERROR))
3190 {
3191 err = (err != REG_NOERROR ? err
3192 : (err2 != REG_NOERROR ? err2 : err3));
3193 return err;
3194 }
3195 /* TODO: It is still inefficient... */
3196 goto restart;
3197 }
3198 else
3199 {
3200 re_node_set union_set;
3201 next_node = dfa->nexts[ent->node];
3202 if (mctx->state_log[to_idx])
3203 {
3204 bool ok;
3205 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3206 next_node))
3207 continue;
3208 err = re_node_set_init_copy (&union_set,
3209 &mctx->state_log[to_idx]->nodes);
3210 ok = re_node_set_insert (&union_set, next_node);
3211 if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3212 {
3213 re_node_set_free (&union_set);
3214 err = err != REG_NOERROR ? err : REG_ESPACE;
3215 return err;
3216 }
3217 }
3218 else
3219 {
3220 err = re_node_set_init_1 (&union_set, next_node);
3221 if (__glibc_unlikely (err != REG_NOERROR))
3222 return err;
3223 }
3224 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3225 re_node_set_free (&union_set);
3226 if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3227 && err != REG_NOERROR))
3228 return err;
3229 }
3230 }
3231 while (ent++->more);
3232 return REG_NOERROR;
3233}
3234
3235/* Build transition table for the state.
3236 Return true if successful. */
3237
3238static bool __attribute_noinline__
3239build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3240{
3241 reg_errcode_t err;
3242 Idx i, j;
3243 int ch;
3244 bool need_word_trtable = false;
3245 bitset_word_t elem, mask;
3246 Idx ndests; /* Number of the destination states from 'state'. */
3247 re_dfastate_t **trtable;
3248 re_dfastate_t *dest_states[SBC_MAX];
3249 re_dfastate_t *dest_states_word[SBC_MAX];
3250 re_dfastate_t *dest_states_nl[SBC_MAX];
3251 re_node_set follows;
3252 bitset_t acceptable;
3253
3254 /* We build DFA states which corresponds to the destination nodes
3255 from 'state'. 'dests_node[i]' represents the nodes which i-th
3256 destination state contains, and 'dests_ch[i]' represents the
3257 characters which i-th destination state accepts. */
3258 re_node_set dests_node[SBC_MAX];
3259 bitset_t dests_ch[SBC_MAX];
3260
3261 /* Initialize transition table. */
3262 state->word_trtable = state->trtable = NULL;
3263
3264 /* At first, group all nodes belonging to 'state' into several
3265 destinations. */
3266 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3267 if (__glibc_unlikely (ndests <= 0))
3268 {
3269 /* Return false in case of an error, true otherwise. */
3270 if (ndests == 0)
3271 {
3272 state->trtable = (re_dfastate_t **)
3273 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3274 if (__glibc_unlikely (state->trtable == NULL))
3275 return false;
3276 return true;
3277 }
3278 return false;
3279 }
3280
3281 err = re_node_set_alloc (&follows, ndests + 1);
3282 if (__glibc_unlikely (err != REG_NOERROR))
3283 {
3284 out_free:
3285 re_node_set_free (&follows);
3286 for (i = 0; i < ndests; ++i)
3287 re_node_set_free (dests_node + i);
3288 return false;
3289 }
3290
3291 bitset_empty (acceptable);
3292
3293 /* Then build the states for all destinations. */
3294 for (i = 0; i < ndests; ++i)
3295 {
3296 Idx next_node;
3297 re_node_set_empty (&follows);
3298 /* Merge the follows of this destination states. */
3299 for (j = 0; j < dests_node[i].nelem; ++j)
3300 {
3301 next_node = dfa->nexts[dests_node[i].elems[j]];
3302 if (next_node != -1)
3303 {
3304 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3305 if (__glibc_unlikely (err != REG_NOERROR))
3306 goto out_free;
3307 }
3308 }
3309 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3310 if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3311 goto out_free;
3312 /* If the new state has context constraint,
3313 build appropriate states for these contexts. */
3314 if (dest_states[i]->has_constraint)
3315 {
3316 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3317 CONTEXT_WORD);
3318 if (__glibc_unlikely (dest_states_word[i] == NULL
3319 && err != REG_NOERROR))
3320 goto out_free;
3321
3322 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3323 need_word_trtable = true;
3324
3325 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3326 CONTEXT_NEWLINE);
3327 if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3328 goto out_free;
3329 }
3330 else
3331 {
3332 dest_states_word[i] = dest_states[i];
3333 dest_states_nl[i] = dest_states[i];
3334 }
3335 bitset_merge (acceptable, dests_ch[i]);
3336 }
3337
3338 if (!__glibc_unlikely (need_word_trtable))
3339 {
3340 /* We don't care about whether the following character is a word
3341 character, or we are in a single-byte character set so we can
3342 discern by looking at the character code: allocate a
3343 256-entry transition table. */
3344 trtable = state->trtable =
3345 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3346 if (__glibc_unlikely (trtable == NULL))
3347 goto out_free;
3348
3349 /* For all characters ch...: */
3350 for (i = 0; i < BITSET_WORDS; ++i)
3351 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3352 elem;
3353 mask <<= 1, elem >>= 1, ++ch)
3354 if (__glibc_unlikely (elem & 1))
3355 {
3356 /* There must be exactly one destination which accepts
3357 character ch. See group_nodes_into_DFAstates. */
3358 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3359 ;
3360
3361 /* j-th destination accepts the word character ch. */
3362 if (dfa->word_char[i] & mask)
3363 trtable[ch] = dest_states_word[j];
3364 else
3365 trtable[ch] = dest_states[j];
3366 }
3367 }
3368 else
3369 {
3370 /* We care about whether the following character is a word
3371 character, and we are in a multi-byte character set: discern
3372 by looking at the character code: build two 256-entry
3373 transition tables, one starting at trtable[0] and one
3374 starting at trtable[SBC_MAX]. */
3375 trtable = state->word_trtable =
3376 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3377 if (__glibc_unlikely (trtable == NULL))
3378 goto out_free;
3379
3380 /* For all characters ch...: */
3381 for (i = 0; i < BITSET_WORDS; ++i)
3382 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3383 elem;
3384 mask <<= 1, elem >>= 1, ++ch)
3385 if (__glibc_unlikely (elem & 1))
3386 {
3387 /* There must be exactly one destination which accepts
3388 character ch. See group_nodes_into_DFAstates. */
3389 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3390 ;
3391
3392 /* j-th destination accepts the word character ch. */
3393 trtable[ch] = dest_states[j];
3394 trtable[ch + SBC_MAX] = dest_states_word[j];
3395 }
3396 }
3397
3398 /* new line */
3399 if (bitset_contain (acceptable, NEWLINE_CHAR))
3400 {
3401 /* The current state accepts newline character. */
3402 for (j = 0; j < ndests; ++j)
3403 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3404 {
3405 /* k-th destination accepts newline character. */
3406 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3407 if (need_word_trtable)
3408 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3409 /* There must be only one destination which accepts
3410 newline. See group_nodes_into_DFAstates. */
3411 break;
3412 }
3413 }
3414
3415 re_node_set_free (&follows);
3416 for (i = 0; i < ndests; ++i)
3417 re_node_set_free (dests_node + i);
3418 return true;
3419}
3420
3421/* Group all nodes belonging to STATE into several destinations.
3422 Then for all destinations, set the nodes belonging to the destination
3423 to DESTS_NODE[i] and set the characters accepted by the destination
3424 to DEST_CH[i]. Return the number of destinations if successful,
3425 -1 on internal error. */
3426
3427static Idx
3428group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3429 re_node_set *dests_node, bitset_t *dests_ch)
3430{
3431 reg_errcode_t err;
3432 bool ok;
3433 Idx i, j, k;
3434 Idx ndests; /* Number of the destinations from 'state'. */
3435 bitset_t accepts; /* Characters a node can accept. */
3436 const re_node_set *cur_nodes = &state->nodes;
3437 bitset_empty (accepts);
3438 ndests = 0;
3439
3440 /* For all the nodes belonging to 'state', */
3441 for (i = 0; i < cur_nodes->nelem; ++i)
3442 {
3443 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3444 re_token_type_t type = node->type;
3445 unsigned int constraint = node->constraint;
3446
3447 /* Enumerate all single byte character this node can accept. */
3448 if (type == CHARACTER)
3449 bitset_set (accepts, node->opr.c);
3450 else if (type == SIMPLE_BRACKET)
3451 {
3452 bitset_merge (accepts, node->opr.sbcset);
3453 }
3454 else if (type == OP_PERIOD)
3455 {
3456 if (dfa->mb_cur_max > 1)
3457 bitset_merge (accepts, dfa->sb_char);
3458 else
3459 bitset_set_all (accepts);
3460 if (!(dfa->syntax & RE_DOT_NEWLINE))
3461 bitset_clear (accepts, '\n');
3462 if (dfa->syntax & RE_DOT_NOT_NULL)
3463 bitset_clear (accepts, '\0');
3464 }
3465 else if (type == OP_UTF8_PERIOD)
3466 {
3467 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3468 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3469 else
3470 bitset_merge (accepts, utf8_sb_map);
3471 if (!(dfa->syntax & RE_DOT_NEWLINE))
3472 bitset_clear (accepts, '\n');
3473 if (dfa->syntax & RE_DOT_NOT_NULL)
3474 bitset_clear (accepts, '\0');
3475 }
3476 else
3477 continue;
3478
3479 /* Check the 'accepts' and sift the characters which are not
3480 match it the context. */
3481 if (constraint)
3482 {
3483 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3484 {
3485 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3486 bitset_empty (accepts);
3487 if (accepts_newline)
3488 bitset_set (accepts, NEWLINE_CHAR);
3489 else
3490 continue;
3491 }
3492 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3493 {
3494 bitset_empty (accepts);
3495 continue;
3496 }
3497
3498 if (constraint & NEXT_WORD_CONSTRAINT)
3499 {
3500 bitset_word_t any_set = 0;
3501 if (type == CHARACTER && !node->word_char)
3502 {
3503 bitset_empty (accepts);
3504 continue;
3505 }
3506 if (dfa->mb_cur_max > 1)
3507 for (j = 0; j < BITSET_WORDS; ++j)
3508 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3509 else
3510 for (j = 0; j < BITSET_WORDS; ++j)
3511 any_set |= (accepts[j] &= dfa->word_char[j]);
3512 if (!any_set)
3513 continue;
3514 }
3515 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3516 {
3517 bitset_word_t any_set = 0;
3518 if (type == CHARACTER && node->word_char)
3519 {
3520 bitset_empty (accepts);
3521 continue;
3522 }
3523 if (dfa->mb_cur_max > 1)
3524 for (j = 0; j < BITSET_WORDS; ++j)
3525 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3526 else
3527 for (j = 0; j < BITSET_WORDS; ++j)
3528 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3529 if (!any_set)
3530 continue;
3531 }
3532 }
3533
3534 /* Then divide 'accepts' into DFA states, or create a new
3535 state. Above, we make sure that accepts is not empty. */
3536 for (j = 0; j < ndests; ++j)
3537 {
3538 bitset_t intersec; /* Intersection sets, see below. */
3539 bitset_t remains;
3540 /* Flags, see below. */
3541 bitset_word_t has_intersec, not_subset, not_consumed;
3542
3543 /* Optimization, skip if this state doesn't accept the character. */
3544 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3545 continue;
3546
3547 /* Enumerate the intersection set of this state and 'accepts'. */
3548 has_intersec = 0;
3549 for (k = 0; k < BITSET_WORDS; ++k)
3550 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3551 /* And skip if the intersection set is empty. */
3552 if (!has_intersec)
3553 continue;
3554
3555 /* Then check if this state is a subset of 'accepts'. */
3556 not_subset = not_consumed = 0;
3557 for (k = 0; k < BITSET_WORDS; ++k)
3558 {
3559 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3560 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3561 }
3562
3563 /* If this state isn't a subset of 'accepts', create a
3564 new group state, which has the 'remains'. */
3565 if (not_subset)
3566 {
3567 bitset_copy (dests_ch[ndests], remains);
3568 bitset_copy (dests_ch[j], intersec);
3569 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3570 if (__glibc_unlikely (err != REG_NOERROR))
3571 goto error_return;
3572 ++ndests;
3573 }
3574
3575 /* Put the position in the current group. */
3576 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3577 if (__glibc_unlikely (! ok))
3578 goto error_return;
3579
3580 /* If all characters are consumed, go to next node. */
3581 if (!not_consumed)
3582 break;
3583 }
3584 /* Some characters remain, create a new group. */
3585 if (j == ndests)
3586 {
3587 bitset_copy (dests_ch[ndests], accepts);
3588 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3589 if (__glibc_unlikely (err != REG_NOERROR))
3590 goto error_return;
3591 ++ndests;
3592 bitset_empty (accepts);
3593 }
3594 }
3595 assume (ndests <= SBC_MAX);
3596 return ndests;
3597 error_return:
3598 for (j = 0; j < ndests; ++j)
3599 re_node_set_free (dests_node + j);
3600 return -1;
3601}
3602
3603/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3604 Return the number of the bytes the node accepts.
3605 STR_IDX is the current index of the input string.
3606
3607 This function handles the nodes which can accept one character, or
3608 one collating element like '.', '[a-z]', opposite to the other nodes
3609 can only accept one byte. */
3610
3611#ifdef _LIBC
3612# include <locale/weight.h>
3613#endif
3614
3615static int
3616check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3617 const re_string_t *input, Idx str_idx)
3618{
3619 const re_token_t *node = dfa->nodes + node_idx;
3620 int char_len, elem_len;
3621 Idx i;
3622
3623 if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3624 {
3625 unsigned char c = re_string_byte_at (input, str_idx), d;
3626 if (__glibc_likely (c < 0xc2))
3627 return 0;
3628
3629 if (str_idx + 2 > input->len)
3630 return 0;
3631
3632 d = re_string_byte_at (input, str_idx + 1);
3633 if (c < 0xe0)
3634 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3635 else if (c < 0xf0)
3636 {
3637 char_len = 3;
3638 if (c == 0xe0 && d < 0xa0)
3639 return 0;
3640 }
3641 else if (c < 0xf8)
3642 {
3643 char_len = 4;
3644 if (c == 0xf0 && d < 0x90)
3645 return 0;
3646 }
3647 else if (c < 0xfc)
3648 {
3649 char_len = 5;
3650 if (c == 0xf8 && d < 0x88)
3651 return 0;
3652 }
3653 else if (c < 0xfe)
3654 {
3655 char_len = 6;
3656 if (c == 0xfc && d < 0x84)
3657 return 0;
3658 }
3659 else
3660 return 0;
3661
3662 if (str_idx + char_len > input->len)
3663 return 0;
3664
3665 for (i = 1; i < char_len; ++i)
3666 {
3667 d = re_string_byte_at (input, str_idx + i);
3668 if (d < 0x80 || d > 0xbf)
3669 return 0;
3670 }
3671 return char_len;
3672 }
3673
3674 char_len = re_string_char_size_at (input, str_idx);
3675 if (node->type == OP_PERIOD)
3676 {
3677 if (char_len <= 1)
3678 return 0;
3679 /* FIXME: I don't think this if is needed, as both '\n'
3680 and '\0' are char_len == 1. */
3681 /* '.' accepts any one character except the following two cases. */
3682 if ((!(dfa->syntax & RE_DOT_NEWLINE)
3683 && re_string_byte_at (input, str_idx) == '\n')
3684 || ((dfa->syntax & RE_DOT_NOT_NULL)
3685 && re_string_byte_at (input, str_idx) == '\0'))
3686 return 0;
3687 return char_len;
3688 }
3689
3690 elem_len = re_string_elem_size_at (input, str_idx);
3691 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3692 return 0;
3693
3694 if (node->type == COMPLEX_BRACKET)
3695 {
3696 const re_charset_t *cset = node->opr.mbcset;
3697#ifdef _LIBC
3698 const unsigned char *pin
3699 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3700 Idx j;
3701 uint32_t nrules;
3702#endif
3703 int match_len = 0;
3704 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3705 ? re_string_wchar_at (input, str_idx) : 0);
3706
3707 /* match with multibyte character? */
3708 for (i = 0; i < cset->nmbchars; ++i)
3709 if (wc == cset->mbchars[i])
3710 {
3711 match_len = char_len;
3712 goto check_node_accept_bytes_match;
3713 }
3714 /* match with character_class? */
3715 for (i = 0; i < cset->nchar_classes; ++i)
3716 {
3717 wctype_t wt = cset->char_classes[i];
3718 if (__iswctype (wc, wt))
3719 {
3720 match_len = char_len;
3721 goto check_node_accept_bytes_match;
3722 }
3723 }
3724
3725#ifdef _LIBC
3726 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3727 if (nrules != 0)
3728 {
3729 unsigned int in_collseq = 0;
3730 const int32_t *table, *indirect;
3731 const unsigned char *weights, *extra;
3732 const char *collseqwc;
3733
3734 /* match with collating_symbol? */
3735 if (cset->ncoll_syms)
3736 extra = (const unsigned char *)
3737 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3738 for (i = 0; i < cset->ncoll_syms; ++i)
3739 {
3740 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3741 /* Compare the length of input collating element and
3742 the length of current collating element. */
3743 if (*coll_sym != elem_len)
3744 continue;
3745 /* Compare each bytes. */
3746 for (j = 0; j < *coll_sym; j++)
3747 if (pin[j] != coll_sym[1 + j])
3748 break;
3749 if (j == *coll_sym)
3750 {
3751 /* Match if every bytes is equal. */
3752 match_len = j;
3753 goto check_node_accept_bytes_match;
3754 }
3755 }
3756
3757 if (cset->nranges)
3758 {
3759 if (elem_len <= char_len)
3760 {
3761 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3762 in_collseq = __collseq_table_lookup (collseqwc, wc);
3763 }
3764 else
3765 in_collseq = find_collation_sequence_value (pin, elem_len);
3766 }
3767 /* match with range expression? */
3768 /* FIXME: Implement rational ranges here, too. */
3769 for (i = 0; i < cset->nranges; ++i)
3770 if (cset->range_starts[i] <= in_collseq
3771 && in_collseq <= cset->range_ends[i])
3772 {
3773 match_len = elem_len;
3774 goto check_node_accept_bytes_match;
3775 }
3776
3777 /* match with equivalence_class? */
3778 if (cset->nequiv_classes)
3779 {
3780 const unsigned char *cp = pin;
3781 table = (const int32_t *)
3782 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3783 weights = (const unsigned char *)
3784 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3785 extra = (const unsigned char *)
3786 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3787 indirect = (const int32_t *)
3788 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3789 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3790 int32_t rule = idx >> 24;
3791 idx &= 0xffffff;
3792 if (idx > 0)
3793 {
3794 size_t weight_len = weights[idx];
3795 for (i = 0; i < cset->nequiv_classes; ++i)
3796 {
3797 int32_t equiv_class_idx = cset->equiv_classes[i];
3798 int32_t equiv_class_rule = equiv_class_idx >> 24;
3799 equiv_class_idx &= 0xffffff;
3800 if (weights[equiv_class_idx] == weight_len
3801 && equiv_class_rule == rule
3802 && memcmp (weights + idx + 1,
3803 weights + equiv_class_idx + 1,
3804 weight_len) == 0)
3805 {
3806 match_len = elem_len;
3807 goto check_node_accept_bytes_match;
3808 }
3809 }
3810 }
3811 }
3812 }
3813 else
3814#endif /* _LIBC */
3815 {
3816 /* match with range expression? */
3817 for (i = 0; i < cset->nranges; ++i)
3818 {
3819 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3820 {
3821 match_len = char_len;
3822 goto check_node_accept_bytes_match;
3823 }
3824 }
3825 }
3826 check_node_accept_bytes_match:
3827 if (!cset->non_match)
3828 return match_len;
3829 else
3830 {
3831 if (match_len > 0)
3832 return 0;
3833 else
3834 return (elem_len > char_len) ? elem_len : char_len;
3835 }
3836 }
3837 return 0;
3838}
3839
3840#ifdef _LIBC
3841static unsigned int
3842find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3843{
3844 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3845 if (nrules == 0)
3846 {
3847 if (mbs_len == 1)
3848 {
3849 /* No valid character. Match it as a single byte character. */
3850 const unsigned char *collseq = (const unsigned char *)
3851 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3852 return collseq[mbs[0]];
3853 }
3854 return UINT_MAX;
3855 }
3856 else
3857 {
3858 int32_t idx;
3859 const unsigned char *extra = (const unsigned char *)
3860 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3861 int32_t extrasize = (const unsigned char *)
3862 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3863
3864 for (idx = 0; idx < extrasize;)
3865 {
3866 int mbs_cnt;
3867 bool found = false;
3868 int32_t elem_mbs_len;
3869 /* Skip the name of collating element name. */
3870 idx = idx + extra[idx] + 1;
3871 elem_mbs_len = extra[idx++];
3872 if (mbs_len == elem_mbs_len)
3873 {
3874 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3875 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3876 break;
3877 if (mbs_cnt == elem_mbs_len)
3878 /* Found the entry. */
3879 found = true;
3880 }
3881 /* Skip the byte sequence of the collating element. */
3882 idx += elem_mbs_len;
3883 /* Adjust for the alignment. */
3884 idx = (idx + 3) & ~3;
3885 /* Skip the collation sequence value. */
3886 idx += sizeof (uint32_t);
3887 /* Skip the wide char sequence of the collating element. */
3888 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3889 /* If we found the entry, return the sequence value. */
3890 if (found)
3891 return *(uint32_t *) (extra + idx);
3892 /* Skip the collation sequence value. */
3893 idx += sizeof (uint32_t);
3894 }
3895 return UINT_MAX;
3896 }
3897}
3898#endif /* _LIBC */
3899
3900/* Check whether the node accepts the byte which is IDX-th
3901 byte of the INPUT. */
3902
3903static bool
3904check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3905 Idx idx)
3906{
3907 unsigned char ch;
3908 ch = re_string_byte_at (&mctx->input, idx);
3909 switch (node->type)
3910 {
3911 case CHARACTER:
3912 if (node->opr.c != ch)
3913 return false;
3914 break;
3915
3916 case SIMPLE_BRACKET:
3917 if (!bitset_contain (node->opr.sbcset, ch))
3918 return false;
3919 break;
3920
3921 case OP_UTF8_PERIOD:
3922 if (ch >= ASCII_CHARS)
3923 return false;
3924 FALLTHROUGH;
3925 case OP_PERIOD:
3926 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3927 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3928 return false;
3929 break;
3930
3931 default:
3932 return false;
3933 }
3934
3935 if (node->constraint)
3936 {
3937 /* The node has constraints. Check whether the current context
3938 satisfies the constraints. */
3939 unsigned int context = re_string_context_at (&mctx->input, idx,
3940 mctx->eflags);
3941 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3942 return false;
3943 }
3944
3945 return true;
3946}
3947
3948/* Extend the buffers, if the buffers have run out. */
3949
3950static reg_errcode_t
3951__attribute_warn_unused_result__
3952extend_buffers (re_match_context_t *mctx, int min_len)
3953{
3954 reg_errcode_t ret;
3955 re_string_t *pstr = &mctx->input;
3956
3957 /* Avoid overflow. */
3958 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3959 <= pstr->bufs_len))
3960 return REG_ESPACE;
3961
3962 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
3963 ret = re_string_realloc_buffers (pstr,
3964 MAX (min_len,
3965 MIN (pstr->len, pstr->bufs_len * 2)));
3966 if (__glibc_unlikely (ret != REG_NOERROR))
3967 return ret;
3968
3969 if (mctx->state_log != NULL)
3970 {
3971 /* And double the length of state_log. */
3972 /* XXX We have no indication of the size of this buffer. If this
3973 allocation fail we have no indication that the state_log array
3974 does not have the right size. */
3975 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3976 pstr->bufs_len + 1);
3977 if (__glibc_unlikely (new_array == NULL))
3978 return REG_ESPACE;
3979 mctx->state_log = new_array;
3980 }
3981
3982 /* Then reconstruct the buffers. */
3983 if (pstr->icase)
3984 {
3985 if (pstr->mb_cur_max > 1)
3986 {
3987 ret = build_wcs_upper_buffer (pstr);
3988 if (__glibc_unlikely (ret != REG_NOERROR))
3989 return ret;
3990 }
3991 else
3992 build_upper_buffer (pstr);
3993 }
3994 else
3995 {
3996 if (pstr->mb_cur_max > 1)
3997 build_wcs_buffer (pstr);
3998 else
3999 {
4000 if (pstr->trans != NULL)
4001 re_string_translate_buffer (pstr);
4002 }
4003 }
4004 return REG_NOERROR;
4005}
4006
4007
4008
4009/* Functions for matching context. */
4010
4011/* Initialize MCTX. */
4012
4013static reg_errcode_t
4014__attribute_warn_unused_result__
4015match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4016{
4017 mctx->eflags = eflags;
4018 mctx->match_last = -1;
4019 if (n > 0)
4020 {
4021 /* Avoid overflow. */
4022 size_t max_object_size =
4023 MAX (sizeof (struct re_backref_cache_entry),
4024 sizeof (re_sub_match_top_t *));
4025 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4026 return REG_ESPACE;
4027
4028 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4029 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4030 if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4031 return REG_ESPACE;
4032 }
4033 /* Already zero-ed by the caller.
4034 else
4035 mctx->bkref_ents = NULL;
4036 mctx->nbkref_ents = 0;
4037 mctx->nsub_tops = 0; */
4038 mctx->abkref_ents = n;
4039 mctx->max_mb_elem_len = 1;
4040 mctx->asub_tops = n;
4041 return REG_NOERROR;
4042}
4043
4044/* Clean the entries which depend on the current input in MCTX.
4045 This function must be invoked when the matcher changes the start index
4046 of the input, or changes the input string. */
4047
4048static void
4049match_ctx_clean (re_match_context_t *mctx)
4050{
4051 Idx st_idx;
4052 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4053 {
4054 Idx sl_idx;
4055 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4056 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4057 {
4058 re_sub_match_last_t *last = top->lasts[sl_idx];
4059 re_free (last->path.array);
4060 re_free (last);
4061 }
4062 re_free (top->lasts);
4063 if (top->path)
4064 {
4065 re_free (top->path->array);
4066 re_free (top->path);
4067 }
4068 re_free (top);
4069 }
4070
4071 mctx->nsub_tops = 0;
4072 mctx->nbkref_ents = 0;
4073}
4074
4075/* Free all the memory associated with MCTX. */
4076
4077static void
4078match_ctx_free (re_match_context_t *mctx)
4079{
4080 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4081 match_ctx_clean (mctx);
4082 re_free (mctx->sub_tops);
4083 re_free (mctx->bkref_ents);
4084}
4085
4086/* Add a new backreference entry to MCTX.
4087 Note that we assume that caller never call this function with duplicate
4088 entry, and call with STR_IDX which isn't smaller than any existing entry.
4089*/
4090
4091static reg_errcode_t
4092__attribute_warn_unused_result__
4093match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4094 Idx to)
4095{
4096 if (mctx->nbkref_ents >= mctx->abkref_ents)
4097 {
4098 struct re_backref_cache_entry* new_entry;
4099 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4100 mctx->abkref_ents * 2);
4101 if (__glibc_unlikely (new_entry == NULL))
4102 {
4103 re_free (mctx->bkref_ents);
4104 return REG_ESPACE;
4105 }
4106 mctx->bkref_ents = new_entry;
4107 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4108 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4109 mctx->abkref_ents *= 2;
4110 }
4111 if (mctx->nbkref_ents > 0
4112 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4113 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4114
4115 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4116 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4117 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4118 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4119
4120 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4121 If bit N is clear, means that this entry won't epsilon-transition to
4122 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4123 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4124 such node.
4125
4126 A backreference does not epsilon-transition unless it is empty, so set
4127 to all zeros if FROM != TO. */
4128 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4129 = (from == to ? -1 : 0);
4130
4131 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4132 if (mctx->max_mb_elem_len < to - from)
4133 mctx->max_mb_elem_len = to - from;
4134 return REG_NOERROR;
4135}
4136
4137/* Return the first entry with the same str_idx, or -1 if none is
4138 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4139
4140static Idx
4141search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4142{
4143 Idx left, right, mid, last;
4144 last = right = mctx->nbkref_ents;
4145 for (left = 0; left < right;)
4146 {
4147 mid = (left + right) / 2;
4148 if (mctx->bkref_ents[mid].str_idx < str_idx)
4149 left = mid + 1;
4150 else
4151 right = mid;
4152 }
4153 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4154 return left;
4155 else
4156 return -1;
4157}
4158
4159/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4160 at STR_IDX. */
4161
4162static reg_errcode_t
4163__attribute_warn_unused_result__
4164match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4165{
4166 DEBUG_ASSERT (mctx->sub_tops != NULL);
4167 DEBUG_ASSERT (mctx->asub_tops > 0);
4168 if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4169 {
4170 Idx new_asub_tops = mctx->asub_tops * 2;
4171 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4172 re_sub_match_top_t *,
4173 new_asub_tops);
4174 if (__glibc_unlikely (new_array == NULL))
4175 return REG_ESPACE;
4176 mctx->sub_tops = new_array;
4177 mctx->asub_tops = new_asub_tops;
4178 }
4179 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4180 if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4181 return REG_ESPACE;
4182 mctx->sub_tops[mctx->nsub_tops]->node = node;
4183 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4184 return REG_NOERROR;
4185}
4186
4187/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4188 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4189 Return the new entry if successful, NULL if memory is exhausted. */
4190
4191static re_sub_match_last_t *
4192match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4193{
4194 re_sub_match_last_t *new_entry;
4195 if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4196 {
4197 Idx new_alasts = 2 * subtop->alasts + 1;
4198 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4199 re_sub_match_last_t *,
4200 new_alasts);
4201 if (__glibc_unlikely (new_array == NULL))
4202 return NULL;
4203 subtop->lasts = new_array;
4204 subtop->alasts = new_alasts;
4205 }
4206 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4207 if (__glibc_likely (new_entry != NULL))
4208 {
4209 subtop->lasts[subtop->nlasts] = new_entry;
4210 new_entry->node = node;
4211 new_entry->str_idx = str_idx;
4212 ++subtop->nlasts;
4213 }
4214 return new_entry;
4215}
4216
4217static void
4218sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4219 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4220{
4221 sctx->sifted_states = sifted_sts;
4222 sctx->limited_states = limited_sts;
4223 sctx->last_node = last_node;
4224 sctx->last_str_idx = last_str_idx;
4225 re_node_set_init_empty (&sctx->limits);
4226}
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette