regexec.c@ 3613

Last change on this file since 3613 was 3613, checked in by bird, 8 months ago
src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge ^{/vendor/sed/4.1.5}/vendor/sed/current .)
File size: 125.2 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. */
19
20	static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21	Idx n);
22	static void match_ctx_clean (re_match_context_t *mctx);
23	static void match_ctx_free (re_match_context_t *cache);
24	static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25	Idx str_idx, Idx from, Idx to);
26	static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27	static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28	Idx str_idx);
29	static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30	Idx node, Idx str_idx);
31	static void sift_ctx_init (re_sift_context_t sctx, re_dfastate_t *sifted_sts,
32	re_dfastate_t **limited_sts, Idx last_node,
33	Idx last_str_idx);
34	static reg_errcode_t re_search_internal (const regex_t *preg,
35	const char *string, Idx length,
36	Idx start, Idx last_start, Idx stop,
37	size_t nmatch, regmatch_t pmatch[],
38	int eflags);
39	static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40	const char *string1, Idx length1,
41	const char *string2, Idx length2,
42	Idx start, regoff_t range,
43	struct re_registers *regs,
44	Idx stop, bool ret_len);
45	static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46	const char *string, Idx length, Idx start,
47	regoff_t range, Idx stop,
48	struct re_registers *regs,
49	bool ret_len);
50	static unsigned re_copy_regs (struct re_registers regs, regmatch_t pmatch,
51	Idx nregs, int regs_allocated);
52	static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53	static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54	Idx *p_match_first);
55	static Idx check_halt_state_context (const re_match_context_t *mctx,
56	const re_dfastate_t *state, Idx idx);
57	static void update_regs (const re_dfa_t dfa, regmatch_t pmatch,
58	regmatch_t *prev_idx_match, Idx cur_node,
59	Idx cur_idx, Idx nmatch);
60	static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61	Idx str_idx, Idx dest_node, Idx nregs,
62	regmatch_t regs, regmatch_t prevregs,
63	re_node_set *eps_via_nodes);
64	static reg_errcode_t set_regs (const regex_t *preg,
65	const re_match_context_t *mctx,
66	size_t nmatch, regmatch_t *pmatch,
67	bool fl_backtrack);
68	static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70	static int sift_states_iter_mb (const re_match_context_t *mctx,
71	re_sift_context_t *sctx,
72	Idx node_idx, Idx str_idx, Idx max_str_idx);
73	static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
74	re_sift_context_t *sctx);
75	static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
76	re_sift_context_t *sctx, Idx str_idx,
77	re_node_set *cur_dest);
78	static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
79	re_sift_context_t *sctx,
80	Idx str_idx,
81	re_node_set *dest_nodes);
82	static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
83	re_node_set *dest_nodes,
84	const re_node_set *candidates);
85	static bool check_dst_limits (const re_match_context_t *mctx,
86	const re_node_set *limits,
87	Idx dst_node, Idx dst_idx, Idx src_node,
88	Idx src_idx);
89	static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
90	int boundaries, Idx subexp_idx,
91	Idx from_node, Idx bkref_idx);
92	static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
93	Idx limit, Idx subexp_idx,
94	Idx node, Idx str_idx,
95	Idx bkref_idx);
96	static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
97	re_node_set *dest_nodes,
98	const re_node_set *candidates,
99	re_node_set *limits,
100	struct re_backref_cache_entry *bkref_ents,
101	Idx str_idx);
102	static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
103	re_sift_context_t *sctx,
104	Idx str_idx, const re_node_set *candidates);
105	static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
106	re_dfastate_t **dst,
107	re_dfastate_t **src, Idx num);
108	static re_dfastate_t find_recover_state (reg_errcode_t err,
109	re_match_context_t *mctx);
110	static re_dfastate_t transit_state (reg_errcode_t err,
111	re_match_context_t *mctx,
112	re_dfastate_t *state);
113	static re_dfastate_t merge_state_with_log (reg_errcode_t err,
114	re_match_context_t *mctx,
115	re_dfastate_t *next_state);
116	static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
117	re_node_set *cur_nodes,
118	Idx str_idx);
119	#if 0
120	static re_dfastate_t transit_state_sb (reg_errcode_t err,
121	re_match_context_t *mctx,
122	re_dfastate_t *pstate);
123	#endif
124	static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
125	re_dfastate_t *pstate);
126	static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
127	const re_node_set *nodes);
128	static reg_errcode_t get_subexp (re_match_context_t *mctx,
129	Idx bkref_node, Idx bkref_str_idx);
130	static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
131	const re_sub_match_top_t *sub_top,
132	re_sub_match_last_t *sub_last,
133	Idx bkref_node, Idx bkref_str);
134	static Idx find_subexp_node (const re_dfa_t dfa, const re_node_set nodes,
135	Idx subexp_idx, int type);
136	static reg_errcode_t check_arrival (re_match_context_t *mctx,
137	state_array_t *path, Idx top_node,
138	Idx top_str, Idx last_node, Idx last_str,
139	int type);
140	static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
141	Idx str_idx,
142	re_node_set *cur_nodes,
143	re_node_set *next_nodes);
144	static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
145	re_node_set *cur_nodes,
146	Idx ex_subexp, int type);
147	static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
148	re_node_set *dst_nodes,
149	Idx target, Idx ex_subexp,
150	int type);
151	static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
152	re_node_set *cur_nodes, Idx cur_str,
153	Idx subexp_num, int type);
154	static bool build_trtable (const re_dfa_t dfa, re_dfastate_t state);
155	static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
156	const re_string_t *input, Idx idx);
157	#ifdef _LIBC
158	static unsigned int find_collation_sequence_value (const unsigned char *mbs,
159	size_t name_len);
160	#endif
161	static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
162	const re_dfastate_t *state,
163	re_node_set *states_node,
164	bitset_t *states_ch);
165	static bool check_node_accept (const re_match_context_t *mctx,
166	const re_token_t *node, Idx idx);
167	static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
168
169
170	/* Entry point for POSIX code. */
171
172	/* regexec searches for a given pattern, specified by PREG, in the
173	string STRING.
174
175	If NMATCH is zero or REG_NOSUB was set in the cflags argument to
176	'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
177	least NMATCH elements, and we set them to the offsets of the
178	corresponding matched substrings.
179
180	EFLAGS specifies "execution flags" which affect matching: if
181	REG_NOTBOL is set, then ^ does not match at the beginning of the
182	string; if REG_NOTEOL is set, then $ does not match at the end.
183
184	Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
185	EFLAGS is invalid. */
186
187	int
188	regexec (const regex_t __restrict preg, const char __restrict string,
189	size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
190	{
191	reg_errcode_t err;
192	Idx start, length;
193	re_dfa_t *dfa = preg->buffer;
194
195	if (eflags & ~(REG_NOTBOL \| REG_NOTEOL \| REG_STARTEND))
196	return REG_BADPAT;
197
198	if (eflags & REG_STARTEND)
199	{
200	start = pmatch[0].rm_so;
201	length = pmatch[0].rm_eo;
202	}
203	else
204	{
205	start = 0;
206	length = strlen (string);
207	}
208
209	lock_lock (dfa->lock);
210	if (preg->no_sub)
211	err = re_search_internal (preg, string, length, start, length,
212	length, 0, NULL, eflags);
213	else
214	err = re_search_internal (preg, string, length, start, length,
215	length, nmatch, pmatch, eflags);
216	lock_unlock (dfa->lock);
217	return err != REG_NOERROR;
218	}
219
220	#ifdef _LIBC
221	libc_hidden_def (__regexec)
222
223	# include <shlib-compat.h>
224	versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
225
226	# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
227	__typeof__ (__regexec) __compat_regexec;
228
229	int
230	attribute_compat_text_section
231	__compat_regexec (const regex_t *__restrict preg,
232	const char *__restrict string, size_t nmatch,
233	regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
234	{
235	return regexec (preg, string, nmatch, pmatch,
236	eflags & (REG_NOTBOL \| REG_NOTEOL));
237	}
238	compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
239	# endif
240	#endif
241
242	/* Entry points for GNU code. */
243
244	/* re_match, re_search, re_match_2, re_search_2
245
246	The former two functions operate on STRING with length LENGTH,
247	while the later two operate on concatenation of STRING1 and STRING2
248	with lengths LENGTH1 and LENGTH2, respectively.
249
250	re_match() matches the compiled pattern in BUFP against the string,
251	starting at index START.
252
253	re_search() first tries matching at index START, then it tries to match
254	starting from index START + 1, and so on. The last start position tried
255	is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
256	way as re_match().)
257
258	The parameter STOP of re_{match,search}_2 specifies that no match exceeding
259	the first STOP characters of the concatenation of the strings should be
260	concerned.
261
262	If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
263	and all groups is stored in REGS. (For the "_2" variants, the offsets are
264	computed relative to the concatenation, not relative to the individual
265	strings.)
266
267	On success, re_match* functions return the length of the match, re_search*
268	return the position of the start of the match. They return -1 on
269	match failure, -2 on error. */
270
271	regoff_t
272	re_match (struct re_pattern_buffer bufp, const char string, Idx length,
273	Idx start, struct re_registers *regs)
274	{
275	return re_search_stub (bufp, string, length, start, 0, length, regs, true);
276	}
277	#ifdef _LIBC
278	weak_alias (__re_match, re_match)
279	#endif
280
281	regoff_t
282	re_search (struct re_pattern_buffer bufp, const char string, Idx length,
283	Idx start, regoff_t range, struct re_registers *regs)
284	{
285	return re_search_stub (bufp, string, length, start, range, length, regs,
286	false);
287	}
288	#ifdef _LIBC
289	weak_alias (__re_search, re_search)
290	#endif
291
292	regoff_t
293	re_match_2 (struct re_pattern_buffer bufp, const char string1, Idx length1,
294	const char *string2, Idx length2, Idx start,
295	struct re_registers *regs, Idx stop)
296	{
297	return re_search_2_stub (bufp, string1, length1, string2, length2,
298	start, 0, regs, stop, true);
299	}
300	#ifdef _LIBC
301	weak_alias (__re_match_2, re_match_2)
302	#endif
303
304	regoff_t
305	re_search_2 (struct re_pattern_buffer bufp, const char string1, Idx length1,
306	const char *string2, Idx length2, Idx start, regoff_t range,
307	struct re_registers *regs, Idx stop)
308	{
309	return re_search_2_stub (bufp, string1, length1, string2, length2,
310	start, range, regs, stop, false);
311	}
312	#ifdef _LIBC
313	weak_alias (__re_search_2, re_search_2)
314	#endif
315
316	static regoff_t
317	re_search_2_stub (struct re_pattern_buffer bufp, const char string1,
318	Idx length1, const char *string2, Idx length2, Idx start,
319	regoff_t range, struct re_registers *regs,
320	Idx stop, bool ret_len)
321	{
322	const char *str;
323	regoff_t rval;
324	Idx len;
325	char *s = NULL;
326
327	if (__glibc_unlikely ((length1 < 0 \|\| length2 < 0 \|\| stop < 0
328	\|\| INT_ADD_WRAPV (length1, length2, &len))))
329	return -2;
330
331	/* Concatenate the strings. */
332	if (length2 > 0)
333	if (length1 > 0)
334	{
335	s = re_malloc (char, len);
336
337	if (__glibc_unlikely (s == NULL))
338	return -2;
339	#ifdef _LIBC
340	memcpy (__mempcpy (s, string1, length1), string2, length2);
341	#else
342	memcpy (s, string1, length1);
343	memcpy (s + length1, string2, length2);
344	#endif
345	str = s;
346	}
347	else
348	str = string2;
349	else
350	str = string1;
351
352	rval = re_search_stub (bufp, str, len, start, range, stop, regs,
353	ret_len);
354	re_free (s);
355	return rval;
356	}
357
358	/* The parameters have the same meaning as those of re_search.
359	Additional parameters:
360	If RET_LEN is true the length of the match is returned (re_match style);
361	otherwise the position of the match is returned. */
362
363	static regoff_t
364	re_search_stub (struct re_pattern_buffer bufp, const char string, Idx length,
365	Idx start, regoff_t range, Idx stop, struct re_registers *regs,
366	bool ret_len)
367	{
368	reg_errcode_t result;
369	regmatch_t *pmatch;
370	Idx nregs;
371	regoff_t rval;
372	int eflags = 0;
373	re_dfa_t *dfa = bufp->buffer;
374	Idx last_start = start + range;
375
376	/* Check for out-of-range. */
377	if (__glibc_unlikely (start < 0 \|\| start > length))
378	return -1;
379	if (__glibc_unlikely (length < last_start
380	\|\| (0 <= range && last_start < start)))
381	last_start = length;
382	else if (__glibc_unlikely (last_start < 0
383	\|\| (range < 0 && start <= last_start)))
384	last_start = 0;
385
386	lock_lock (dfa->lock);
387
388	eflags \|= (bufp->not_bol) ? REG_NOTBOL : 0;
389	eflags \|= (bufp->not_eol) ? REG_NOTEOL : 0;
390
391	/* Compile fastmap if we haven't yet. */
392	if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
393	re_compile_fastmap (bufp);
394
395	if (__glibc_unlikely (bufp->no_sub))
396	regs = NULL;
397
398	/* We need at least 1 register. */
399	if (regs == NULL)
400	nregs = 1;
401	else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
402	&& regs->num_regs <= bufp->re_nsub))
403	{
404	nregs = regs->num_regs;
405	if (__glibc_unlikely (nregs < 1))
406	{
407	/* Nothing can be copied to regs. */
408	regs = NULL;
409	nregs = 1;
410	}
411	}
412	else
413	nregs = bufp->re_nsub + 1;
414	pmatch = re_malloc (regmatch_t, nregs);
415	if (__glibc_unlikely (pmatch == NULL))
416	{
417	rval = -2;
418	goto out;
419	}
420
421	result = re_search_internal (bufp, string, length, start, last_start, stop,
422	nregs, pmatch, eflags);
423
424	rval = 0;
425
426	/* I hope we needn't fill their regs with -1's when no match was found. */
427	if (result != REG_NOERROR)
428	rval = result == REG_NOMATCH ? -1 : -2;
429	else if (regs != NULL)
430	{
431	/* If caller wants register contents data back, copy them. */
432	bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
433	bufp->regs_allocated);
434	if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
435	rval = -2;
436	}
437
438	if (__glibc_likely (rval == 0))
439	{
440	if (ret_len)
441	{
442	DEBUG_ASSERT (pmatch[0].rm_so == start);
443	rval = pmatch[0].rm_eo - start;
444	}
445	else
446	rval = pmatch[0].rm_so;
447	}
448	re_free (pmatch);
449	out:
450	lock_unlock (dfa->lock);
451	return rval;
452	}
453
454	static unsigned
455	re_copy_regs (struct re_registers regs, regmatch_t pmatch, Idx nregs,
456	int regs_allocated)
457	{
458	int rval = REGS_REALLOCATE;
459	Idx i;
460	Idx need_regs = nregs + 1;
461	/* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
462	uses. */
463
464	/* Have the register data arrays been allocated? */
465	if (regs_allocated == REGS_UNALLOCATED)
466	{ /* No. So allocate them with malloc. */
467	regs->start = re_malloc (regoff_t, need_regs);
468	if (__glibc_unlikely (regs->start == NULL))
469	return REGS_UNALLOCATED;
470	regs->end = re_malloc (regoff_t, need_regs);
471	if (__glibc_unlikely (regs->end == NULL))
472	{
473	re_free (regs->start);
474	return REGS_UNALLOCATED;
475	}
476	regs->num_regs = need_regs;
477	}
478	else if (regs_allocated == REGS_REALLOCATE)
479	{ /* Yes. If we need more elements than were already
480	allocated, reallocate them. If we need fewer, just
481	leave it alone. */
482	if (__glibc_unlikely (need_regs > regs->num_regs))
483	{
484	regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
485	regoff_t *new_end;
486	if (__glibc_unlikely (new_start == NULL))
487	return REGS_UNALLOCATED;
488	new_end = re_realloc (regs->end, regoff_t, need_regs);
489	if (__glibc_unlikely (new_end == NULL))
490	{
491	re_free (new_start);
492	return REGS_UNALLOCATED;
493	}
494	regs->start = new_start;
495	regs->end = new_end;
496	regs->num_regs = need_regs;
497	}
498	}
499	else
500	{
501	DEBUG_ASSERT (regs_allocated == REGS_FIXED);
502	/* This function may not be called with REGS_FIXED and nregs too big. */
503	DEBUG_ASSERT (nregs <= regs->num_regs);
504	rval = REGS_FIXED;
505	}
506
507	/* Copy the regs. */
508	for (i = 0; i < nregs; ++i)
509	{
510	regs->start[i] = pmatch[i].rm_so;
511	regs->end[i] = pmatch[i].rm_eo;
512	}
513	for ( ; i < regs->num_regs; ++i)
514	regs->start[i] = regs->end[i] = -1;
515
516	return rval;
517	}
518
519	/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
520	ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
521	this memory for recording register information. STARTS and ENDS
522	must be allocated using the malloc library routine, and must each
523	be at least NUM_REGS * sizeof (regoff_t) bytes long.
524
525	If NUM_REGS == 0, then subsequent matches should allocate their own
526	register data.
527
528	Unless this function is called, the first search or match using
529	PATTERN_BUFFER will allocate its own register data, without
530	freeing the old data. */
531
532	void
533	re_set_registers (struct re_pattern_buffer bufp, struct re_registers regs,
534	__re_size_t num_regs, regoff_t starts, regoff_t ends)
535	{
536	if (num_regs)
537	{
538	bufp->regs_allocated = REGS_REALLOCATE;
539	regs->num_regs = num_regs;
540	regs->start = starts;
541	regs->end = ends;
542	}
543	else
544	{
545	bufp->regs_allocated = REGS_UNALLOCATED;
546	regs->num_regs = 0;
547	regs->start = regs->end = NULL;
548	}
549	}
550	#ifdef _LIBC
551	weak_alias (__re_set_registers, re_set_registers)
552	#endif
553
554
555	/* Entry points compatible with 4.2 BSD regex library. We don't define
556	them unless specifically requested. */
557
558	#if defined _REGEX_RE_COMP \|\| defined _LIBC
559	int
560	# ifdef _LIBC
561	weak_function
562	# endif
563	re_exec (const char *s)
564	{
565	return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
566	}
567	#endif /* _REGEX_RE_COMP */
568
569
570	/* Internal entry point. */
571
572	/* Searches for a compiled pattern PREG in the string STRING, whose
573	length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
574	meaning as with regexec. LAST_START is START + RANGE, where
575	START and RANGE have the same meaning as with re_search.
576	Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
577	otherwise return the error code.
578	Note: We assume front end functions already check ranges.
579	(0 <= LAST_START && LAST_START <= LENGTH) */
580
581	static reg_errcode_t
582	__attribute_warn_unused_result__
583	re_search_internal (const regex_t preg, const char string, Idx length,
584	Idx start, Idx last_start, Idx stop, size_t nmatch,
585	regmatch_t pmatch[], int eflags)
586	{
587	reg_errcode_t err;
588	const re_dfa_t *dfa = preg->buffer;
589	Idx left_lim, right_lim;
590	int incr;
591	bool fl_longest_match;
592	int match_kind;
593	Idx match_first;
594	Idx match_last = -1;
595	Idx extra_nmatch;
596	bool sb;
597	int ch;
598	re_match_context_t mctx = { .dfa = dfa };
599	char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
600	&& start != last_start && !preg->can_be_null)
601	? preg->fastmap : NULL);
602	RE_TRANSLATE_TYPE t = preg->translate;
603
604	extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
605	nmatch -= extra_nmatch;
606
607	/* Check if the DFA haven't been compiled. */
608	if (__glibc_unlikely (preg->used == 0 \|\| dfa->init_state == NULL
609	\|\| dfa->init_state_word == NULL
610	\|\| dfa->init_state_nl == NULL
611	\|\| dfa->init_state_begbuf == NULL))
612	return REG_NOMATCH;
613
614	/* We assume front-end functions already check them. */
615	DEBUG_ASSERT (0 <= last_start && last_start <= length);
616
617	/* If initial states with non-begbuf contexts have no elements,
618	the regex must be anchored. If preg->newline_anchor is set,
619	we'll never use init_state_nl, so do not check it. */
620	if (dfa->init_state->nodes.nelem == 0
621	&& dfa->init_state_word->nodes.nelem == 0
622	&& (dfa->init_state_nl->nodes.nelem == 0
623	\|\| !preg->newline_anchor))
624	{
625	if (start != 0 && last_start != 0)
626	return REG_NOMATCH;
627	start = last_start = 0;
628	}
629
630	/* We must check the longest matching, if nmatch > 0. */
631	fl_longest_match = (nmatch != 0 \|\| dfa->nbackref);
632
633	err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
634	preg->translate, (preg->syntax & RE_ICASE) != 0,
635	dfa);
636	if (__glibc_unlikely (err != REG_NOERROR))
637	goto free_return;
638	mctx.input.stop = stop;
639	mctx.input.raw_stop = stop;
640	mctx.input.newline_anchor = preg->newline_anchor;
641
642	err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
643	if (__glibc_unlikely (err != REG_NOERROR))
644	goto free_return;
645
646	/* We will log all the DFA states through which the dfa pass,
647	if nmatch > 1, or this dfa has "multibyte node", which is a
648	back-reference or a node which can accept multibyte character or
649	multi character collating element. */
650	if (nmatch > 1 \|\| dfa->has_mb_node)
651	{
652	/* Avoid overflow. */
653	if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
654	<= mctx.input.bufs_len)))
655	{
656	err = REG_ESPACE;
657	goto free_return;
658	}
659
660	mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
661	if (__glibc_unlikely (mctx.state_log == NULL))
662	{
663	err = REG_ESPACE;
664	goto free_return;
665	}
666	}
667
668	match_first = start;
669	mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
670	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF;
671
672	/* Check incrementally whether the input string matches. */
673	incr = (last_start < start) ? -1 : 1;
674	left_lim = (last_start < start) ? last_start : start;
675	right_lim = (last_start < start) ? start : last_start;
676	sb = dfa->mb_cur_max == 1;
677	match_kind =
678	(fastmap
679	? ((sb \|\| !(preg->syntax & RE_ICASE \|\| t) ? 4 : 0)
680	\| (start <= last_start ? 2 : 0)
681	\| (t != NULL ? 1 : 0))
682	: 8);
683
684	for (;; match_first += incr)
685	{
686	err = REG_NOMATCH;
687	if (match_first < left_lim \|\| right_lim < match_first)
688	goto free_return;
689
690	/* Advance as rapidly as possible through the string, until we
691	find a plausible place to start matching. This may be done
692	with varying efficiency, so there are various possibilities:
693	only the most common of them are specialized, in order to
694	save on code size. We use a switch statement for speed. */
695	switch (match_kind)
696	{
697	case 8:
698	/* No fastmap. */
699	break;
700
701	case 7:
702	/* Fastmap with single-byte translation, match forward. */
703	while (__glibc_likely (match_first < right_lim)
704	&& !fastmap[t[(unsigned char) string[match_first]]])
705	++match_first;
706	goto forward_match_found_start_or_reached_end;
707
708	case 6:
709	/* Fastmap without translation, match forward. */
710	while (__glibc_likely (match_first < right_lim)
711	&& !fastmap[(unsigned char) string[match_first]])
712	++match_first;
713
714	forward_match_found_start_or_reached_end:
715	if (__glibc_unlikely (match_first == right_lim))
716	{
717	ch = match_first >= length
718	? 0 : (unsigned char) string[match_first];
719	if (!fastmap[t ? t[ch] : ch])
720	goto free_return;
721	}
722	break;
723
724	case 4:
725	case 5:
726	/* Fastmap without multi-byte translation, match backwards. */
727	while (match_first >= left_lim)
728	{
729	ch = match_first >= length
730	? 0 : (unsigned char) string[match_first];
731	if (fastmap[t ? t[ch] : ch])
732	break;
733	--match_first;
734	}
735	if (match_first < left_lim)
736	goto free_return;
737	break;
738
739	default:
740	/* In this case, we can't determine easily the current byte,
741	since it might be a component byte of a multibyte
742	character. Then we use the constructed buffer instead. */
743	for (;;)
744	{
745	/* If MATCH_FIRST is out of the valid range, reconstruct the
746	buffers. */
747	__re_size_t offset = match_first - mctx.input.raw_mbs_idx;
748	if (__glibc_unlikely (offset
749	>= (__re_size_t) mctx.input.valid_raw_len))
750	{
751	err = re_string_reconstruct (&mctx.input, match_first,
752	eflags);
753	if (__glibc_unlikely (err != REG_NOERROR))
754	goto free_return;
755
756	offset = match_first - mctx.input.raw_mbs_idx;
757	}
758	/* Use buffer byte if OFFSET is in buffer, otherwise '\0'. */
759	ch = (offset < mctx.input.valid_len
760	? re_string_byte_at (&mctx.input, offset) : 0);
761	if (fastmap[ch])
762	break;
763	match_first += incr;
764	if (match_first < left_lim \|\| match_first > right_lim)
765	{
766	err = REG_NOMATCH;
767	goto free_return;
768	}
769	}
770	break;
771	}
772
773	/* Reconstruct the buffers so that the matcher can assume that
774	the matching starts from the beginning of the buffer. */
775	err = re_string_reconstruct (&mctx.input, match_first, eflags);
776	if (__glibc_unlikely (err != REG_NOERROR))
777	goto free_return;
778
779	/* Don't consider this char as a possible match start if it part,
780	yet isn't the head, of a multibyte character. */
781	if (!sb && !re_string_first_byte (&mctx.input, 0))
782	continue;
783
784	/* It seems to be appropriate one, then use the matcher. */
785	/* We assume that the matching starts from 0. */
786	mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
787	match_last = check_matching (&mctx, fl_longest_match,
788	start <= last_start ? &match_first : NULL);
789	if (match_last != -1)
790	{
791	if (__glibc_unlikely (match_last == -2))
792	{
793	err = REG_ESPACE;
794	goto free_return;
795	}
796	else
797	{
798	mctx.match_last = match_last;
799	if ((!preg->no_sub && nmatch > 1) \|\| dfa->nbackref)
800	{
801	re_dfastate_t *pstate = mctx.state_log[match_last];
802	mctx.last_node = check_halt_state_context (&mctx, pstate,
803	match_last);
804	}
805	if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
806	\|\| dfa->nbackref)
807	{
808	err = prune_impossible_nodes (&mctx);
809	if (err == REG_NOERROR)
810	break;
811	if (__glibc_unlikely (err != REG_NOMATCH))
812	goto free_return;
813	match_last = -1;
814	}
815	else
816	break; /* We found a match. */
817	}
818	}
819
820	match_ctx_clean (&mctx);
821	}
822
823	DEBUG_ASSERT (match_last != -1);
824	DEBUG_ASSERT (err == REG_NOERROR);
825
826	/* Set pmatch[] if we need. */
827	if (nmatch > 0)
828	{
829	Idx reg_idx;
830
831	/* Initialize registers. */
832	for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
833	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
834
835	/* Set the points where matching start/end. */
836	pmatch[0].rm_so = 0;
837	pmatch[0].rm_eo = mctx.match_last;
838	/* FIXME: This function should fail if mctx.match_last exceeds
839	the maximum possible regoff_t value. We need a new error
840	code REG_OVERFLOW. */
841
842	if (!preg->no_sub && nmatch > 1)
843	{
844	err = set_regs (preg, &mctx, nmatch, pmatch,
845	dfa->has_plural_match && dfa->nbackref > 0);
846	if (__glibc_unlikely (err != REG_NOERROR))
847	goto free_return;
848	}
849
850	/* At last, add the offset to each register, since we slid
851	the buffers so that we could assume that the matching starts
852	from 0. */
853	for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
854	if (pmatch[reg_idx].rm_so != -1)
855	{
856	if (__glibc_unlikely (mctx.input.offsets_needed != 0))
857	{
858	pmatch[reg_idx].rm_so =
859	(pmatch[reg_idx].rm_so == mctx.input.valid_len
860	? mctx.input.valid_raw_len
861	: mctx.input.offsets[pmatch[reg_idx].rm_so]);
862	pmatch[reg_idx].rm_eo =
863	(pmatch[reg_idx].rm_eo == mctx.input.valid_len
864	? mctx.input.valid_raw_len
865	: mctx.input.offsets[pmatch[reg_idx].rm_eo]);
866	}
867	pmatch[reg_idx].rm_so += match_first;
868	pmatch[reg_idx].rm_eo += match_first;
869	}
870	for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
871	{
872	pmatch[nmatch + reg_idx].rm_so = -1;
873	pmatch[nmatch + reg_idx].rm_eo = -1;
874	}
875
876	if (dfa->subexp_map)
877	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
878	if (dfa->subexp_map[reg_idx] != reg_idx)
879	{
880	pmatch[reg_idx + 1].rm_so
881	= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
882	pmatch[reg_idx + 1].rm_eo
883	= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
884	}
885	}
886
887	free_return:
888	re_free (mctx.state_log);
889	if (dfa->nbackref)
890	match_ctx_free (&mctx);
891	re_string_destruct (&mctx.input);
892	return err;
893	}
894
895	static reg_errcode_t
896	__attribute_warn_unused_result__
897	prune_impossible_nodes (re_match_context_t *mctx)
898	{
899	const re_dfa_t *const dfa = mctx->dfa;
900	Idx halt_node, match_last;
901	reg_errcode_t ret;
902	re_dfastate_t **sifted_states;
903	re_dfastate_t **lim_states = NULL;
904	re_sift_context_t sctx;
905	DEBUG_ASSERT (mctx->state_log != NULL);
906	match_last = mctx->match_last;
907	halt_node = mctx->last_node;
908
909	/* Avoid overflow. */
910	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
911	<= match_last))
912	return REG_ESPACE;
913
914	sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
915	if (__glibc_unlikely (sifted_states == NULL))
916	{
917	ret = REG_ESPACE;
918	goto free_return;
919	}
920	if (dfa->nbackref)
921	{
922	lim_states = re_malloc (re_dfastate_t *, match_last + 1);
923	if (__glibc_unlikely (lim_states == NULL))
924	{
925	ret = REG_ESPACE;
926	goto free_return;
927	}
928	while (1)
929	{
930	memset (lim_states, '\0',
931	sizeof (re_dfastate_t ) (match_last + 1));
932	sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
933	match_last);
934	ret = sift_states_backward (mctx, &sctx);
935	re_node_set_free (&sctx.limits);
936	if (__glibc_unlikely (ret != REG_NOERROR))
937	goto free_return;
938	if (sifted_states[0] != NULL \|\| lim_states[0] != NULL)
939	break;
940	do
941	{
942	--match_last;
943	if (match_last < 0)
944	{
945	ret = REG_NOMATCH;
946	goto free_return;
947	}
948	} while (mctx->state_log[match_last] == NULL
949	\|\| !mctx->state_log[match_last]->halt);
950	halt_node = check_halt_state_context (mctx,
951	mctx->state_log[match_last],
952	match_last);
953	}
954	ret = merge_state_array (dfa, sifted_states, lim_states,
955	match_last + 1);
956	re_free (lim_states);
957	lim_states = NULL;
958	if (__glibc_unlikely (ret != REG_NOERROR))
959	goto free_return;
960	}
961	else
962	{
963	sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
964	ret = sift_states_backward (mctx, &sctx);
965	re_node_set_free (&sctx.limits);
966	if (__glibc_unlikely (ret != REG_NOERROR))
967	goto free_return;
968	if (sifted_states[0] == NULL)
969	{
970	ret = REG_NOMATCH;
971	goto free_return;
972	}
973	}
974	re_free (mctx->state_log);
975	mctx->state_log = sifted_states;
976	sifted_states = NULL;
977	mctx->last_node = halt_node;
978	mctx->match_last = match_last;
979	ret = REG_NOERROR;
980	free_return:
981	re_free (sifted_states);
982	re_free (lim_states);
983	return ret;
984	}
985
986	/* Acquire an initial state and return it.
987	We must select appropriate initial state depending on the context,
988	since initial states may have constraints like "\<", "^", etc.. */
989
990	static __always_inline re_dfastate_t *
991	acquire_init_state_context (reg_errcode_t err, const re_match_context_t mctx,
992	Idx idx)
993	{
994	const re_dfa_t *const dfa = mctx->dfa;
995	if (dfa->init_state->has_constraint)
996	{
997	unsigned int context;
998	context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
999	if (IS_WORD_CONTEXT (context))
1000	return dfa->init_state_word;
1001	else if (IS_ORDINARY_CONTEXT (context))
1002	return dfa->init_state;
1003	else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1004	return dfa->init_state_begbuf;
1005	else if (IS_NEWLINE_CONTEXT (context))
1006	return dfa->init_state_nl;
1007	else if (IS_BEGBUF_CONTEXT (context))
1008	{
1009	/* It is relatively rare case, then calculate on demand. */
1010	return re_acquire_state_context (err, dfa,
1011	dfa->init_state->entrance_nodes,
1012	context);
1013	}
1014	else
1015	/* Must not happen? */
1016	return dfa->init_state;
1017	}
1018	else
1019	return dfa->init_state;
1020	}
1021
1022	/* Check whether the regular expression match input string INPUT or not,
1023	and return the index where the matching end. Return -1 if
1024	there is no match, and return -2 in case of an error.
1025	FL_LONGEST_MATCH means we want the POSIX longest matching.
1026	If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1027	next place where we may want to try matching.
1028	Note that the matcher assumes that the matching starts from the current
1029	index of the buffer. */
1030
1031	static Idx
1032	__attribute_warn_unused_result__
1033	check_matching (re_match_context_t *mctx, bool fl_longest_match,
1034	Idx *p_match_first)
1035	{
1036	const re_dfa_t *const dfa = mctx->dfa;
1037	reg_errcode_t err;
1038	Idx match = 0;
1039	Idx match_last = -1;
1040	Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1041	re_dfastate_t *cur_state;
1042	bool at_init_state = p_match_first != NULL;
1043	Idx next_start_idx = cur_str_idx;
1044
1045	err = REG_NOERROR;
1046	cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1047	/* An initial state must not be NULL (invalid). */
1048	if (__glibc_unlikely (cur_state == NULL))
1049	{
1050	DEBUG_ASSERT (err == REG_ESPACE);
1051	return -2;
1052	}
1053
1054	if (mctx->state_log != NULL)
1055	{
1056	mctx->state_log[cur_str_idx] = cur_state;
1057
1058	/* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1059	later. E.g. Processing back references. */
1060	if (__glibc_unlikely (dfa->nbackref))
1061	{
1062	at_init_state = false;
1063	err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1064	if (__glibc_unlikely (err != REG_NOERROR))
1065	return err;
1066
1067	if (cur_state->has_backref)
1068	{
1069	err = transit_state_bkref (mctx, &cur_state->nodes);
1070	if (__glibc_unlikely (err != REG_NOERROR))
1071	return err;
1072	}
1073	}
1074	}
1075
1076	/* If the RE accepts NULL string. */
1077	if (__glibc_unlikely (cur_state->halt))
1078	{
1079	if (!cur_state->has_constraint
1080	\|\| check_halt_state_context (mctx, cur_state, cur_str_idx))
1081	{
1082	if (!fl_longest_match)
1083	return cur_str_idx;
1084	else
1085	{
1086	match_last = cur_str_idx;
1087	match = 1;
1088	}
1089	}
1090	}
1091
1092	while (!re_string_eoi (&mctx->input))
1093	{
1094	re_dfastate_t *old_state = cur_state;
1095	Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1096
1097	if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1098	&& mctx->input.bufs_len < mctx->input.len)
1099	\|\| (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1100	&& mctx->input.valid_len < mctx->input.len))
1101	{
1102	err = extend_buffers (mctx, next_char_idx + 1);
1103	if (__glibc_unlikely (err != REG_NOERROR))
1104	{
1105	DEBUG_ASSERT (err == REG_ESPACE);
1106	return -2;
1107	}
1108	}
1109
1110	cur_state = transit_state (&err, mctx, cur_state);
1111	if (mctx->state_log != NULL)
1112	cur_state = merge_state_with_log (&err, mctx, cur_state);
1113
1114	if (cur_state == NULL)
1115	{
1116	/* Reached the invalid state or an error. Try to recover a valid
1117	state using the state log, if available and if we have not
1118	already found a valid (even if not the longest) match. */
1119	if (__glibc_unlikely (err != REG_NOERROR))
1120	return -2;
1121
1122	if (mctx->state_log == NULL
1123	\|\| (match && !fl_longest_match)
1124	\|\| (cur_state = find_recover_state (&err, mctx)) == NULL)
1125	break;
1126	}
1127
1128	if (__glibc_unlikely (at_init_state))
1129	{
1130	if (old_state == cur_state)
1131	next_start_idx = next_char_idx;
1132	else
1133	at_init_state = false;
1134	}
1135
1136	if (cur_state->halt)
1137	{
1138	/* Reached a halt state.
1139	Check the halt state can satisfy the current context. */
1140	if (!cur_state->has_constraint
1141	\|\| check_halt_state_context (mctx, cur_state,
1142	re_string_cur_idx (&mctx->input)))
1143	{
1144	/* We found an appropriate halt state. */
1145	match_last = re_string_cur_idx (&mctx->input);
1146	match = 1;
1147
1148	/* We found a match, do not modify match_first below. */
1149	p_match_first = NULL;
1150	if (!fl_longest_match)
1151	break;
1152	}
1153	}
1154	}
1155
1156	if (p_match_first)
1157	*p_match_first += next_start_idx;
1158
1159	return match_last;
1160	}
1161
1162	/* Check NODE match the current context. */
1163
1164	static bool
1165	check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1166	{
1167	re_token_type_t type = dfa->nodes[node].type;
1168	unsigned int constraint = dfa->nodes[node].constraint;
1169	if (type != END_OF_RE)
1170	return false;
1171	if (!constraint)
1172	return true;
1173	if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1174	return false;
1175	return true;
1176	}
1177
1178	/* Check the halt state STATE match the current context.
1179	Return 0 if not match, if the node, STATE has, is a halt node and
1180	match the context, return the node. */
1181
1182	static Idx
1183	check_halt_state_context (const re_match_context_t *mctx,
1184	const re_dfastate_t *state, Idx idx)
1185	{
1186	Idx i;
1187	unsigned int context;
1188	DEBUG_ASSERT (state->halt);
1189	context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1190	for (i = 0; i < state->nodes.nelem; ++i)
1191	if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1192	return state->nodes.elems[i];
1193	return 0;
1194	}
1195
1196	/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1197	corresponding to the DFA).
1198	Return the destination node, and update EPS_VIA_NODES;
1199	return -1 on match failure, -2 on error. */
1200
1201	static Idx
1202	proceed_next_node (const re_match_context_t mctx, Idx nregs, regmatch_t regs,
1203	regmatch_t *prevregs,
1204	Idx pidx, Idx node, re_node_set eps_via_nodes,
1205	struct re_fail_stack_t *fs)
1206	{
1207	const re_dfa_t *const dfa = mctx->dfa;
1208	if (IS_EPSILON_NODE (dfa->nodes[node].type))
1209	{
1210	re_node_set cur_nodes = &mctx->state_log[pidx]->nodes;
1211	re_node_set *edests = &dfa->edests[node];
1212
1213	if (! re_node_set_contains (eps_via_nodes, node))
1214	{
1215	bool ok = re_node_set_insert (eps_via_nodes, node);
1216	if (__glibc_unlikely (! ok))
1217	return -2;
1218	}
1219
1220	/* Pick a valid destination, or return -1 if none is found. */
1221	Idx dest_node = -1;
1222	for (Idx i = 0; i < edests->nelem; i++)
1223	{
1224	Idx candidate = edests->elems[i];
1225	if (!re_node_set_contains (cur_nodes, candidate))
1226	continue;
1227	if (dest_node == -1)
1228	dest_node = candidate;
1229
1230	else
1231	{
1232	/* In order to avoid infinite loop like "(a)", return the second
1233	epsilon-transition if the first was already considered. */
1234	if (re_node_set_contains (eps_via_nodes, dest_node))
1235	return candidate;
1236
1237	/* Otherwise, push the second epsilon-transition on the fail stack. */
1238	else if (fs != NULL
1239	&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
1240	prevregs, eps_via_nodes))
1241	return -2;
1242
1243	/* We know we are going to exit. */
1244	break;
1245	}
1246	}
1247	return dest_node;
1248	}
1249	else
1250	{
1251	Idx naccepted = 0;
1252	re_token_type_t type = dfa->nodes[node].type;
1253
1254	if (dfa->nodes[node].accept_mb)
1255	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1256	else if (type == OP_BACK_REF)
1257	{
1258	Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1259	if (subexp_idx < nregs)
1260	naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1261	if (fs != NULL)
1262	{
1263	if (subexp_idx >= nregs
1264	\|\| regs[subexp_idx].rm_so == -1
1265	\|\| regs[subexp_idx].rm_eo == -1)
1266	return -1;
1267	else if (naccepted)
1268	{
1269	char buf = (char ) re_string_get_buffer (&mctx->input);
1270	if (mctx->input.valid_len - *pidx < naccepted
1271	\|\| (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1272	naccepted)
1273	!= 0))
1274	return -1;
1275	}
1276	}
1277
1278	if (naccepted == 0)
1279	{
1280	Idx dest_node;
1281	bool ok = re_node_set_insert (eps_via_nodes, node);
1282	if (__glibc_unlikely (! ok))
1283	return -2;
1284	dest_node = dfa->edests[node].elems[0];
1285	if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1286	dest_node))
1287	return dest_node;
1288	}
1289	}
1290
1291	if (naccepted != 0
1292	\|\| check_node_accept (mctx, dfa->nodes + node, *pidx))
1293	{
1294	Idx dest_node = dfa->nexts[node];
1295	pidx = (naccepted == 0) ? pidx + 1 : *pidx + naccepted;
1296	if (fs && (pidx > mctx->match_last \|\| mctx->state_log[pidx] == NULL
1297	\|\| !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1298	dest_node)))
1299	return -1;
1300	re_node_set_empty (eps_via_nodes);
1301	return dest_node;
1302	}
1303	}
1304	return -1;
1305	}
1306
1307	static reg_errcode_t
1308	__attribute_warn_unused_result__
1309	push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1310	Idx nregs, regmatch_t regs, regmatch_t prevregs,
1311	re_node_set *eps_via_nodes)
1312	{
1313	reg_errcode_t err;
1314	Idx num = fs->num;
1315	if (num == fs->alloc)
1316	{
1317	struct re_fail_stack_ent_t *new_array;
1318	new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1319	fs->alloc * 2);
1320	if (new_array == NULL)
1321	return REG_ESPACE;
1322	fs->alloc *= 2;
1323	fs->stack = new_array;
1324	}
1325	fs->stack[num].idx = str_idx;
1326	fs->stack[num].node = dest_node;
1327	fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1328	if (fs->stack[num].regs == NULL)
1329	return REG_ESPACE;
1330	fs->num = num + 1;
1331	memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1332	memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1333	err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1334	return err;
1335	}
1336
1337	static Idx
1338	pop_fail_stack (struct re_fail_stack_t fs, Idx pidx, Idx nregs,
1339	regmatch_t regs, regmatch_t prevregs,
1340	re_node_set *eps_via_nodes)
1341	{
1342	if (fs == NULL \|\| fs->num == 0)
1343	return -1;
1344	Idx num = --fs->num;
1345	*pidx = fs->stack[num].idx;
1346	memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1347	memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1348	re_node_set_free (eps_via_nodes);
1349	re_free (fs->stack[num].regs);
1350	*eps_via_nodes = fs->stack[num].eps_via_nodes;
1351	DEBUG_ASSERT (0 <= fs->stack[num].node);
1352	return fs->stack[num].node;
1353	}
1354
1355
1356	#define DYNARRAY_STRUCT regmatch_list
1357	#define DYNARRAY_ELEMENT regmatch_t
1358	#define DYNARRAY_PREFIX regmatch_list_
1359	#include <malloc/dynarray-skeleton.c>
1360
1361	/* Set the positions where the subexpressions are starts/ends to registers
1362	PMATCH.
1363	Note: We assume that pmatch[0] is already set, and
1364	pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1365
1366	static reg_errcode_t
1367	__attribute_warn_unused_result__
1368	set_regs (const regex_t preg, const re_match_context_t mctx, size_t nmatch,
1369	regmatch_t *pmatch, bool fl_backtrack)
1370	{
1371	const re_dfa_t *dfa = preg->buffer;
1372	Idx idx, cur_node;
1373	re_node_set eps_via_nodes;
1374	struct re_fail_stack_t *fs;
1375	struct re_fail_stack_t fs_body = { 0, 2, NULL };
1376	struct regmatch_list prev_match;
1377	regmatch_list_init (&prev_match);
1378
1379	DEBUG_ASSERT (nmatch > 1);
1380	DEBUG_ASSERT (mctx->state_log != NULL);
1381	if (fl_backtrack)
1382	{
1383	fs = &fs_body;
1384	fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1385	if (fs->stack == NULL)
1386	return REG_ESPACE;
1387	}
1388	else
1389	fs = NULL;
1390
1391	cur_node = dfa->init_node;
1392	re_node_set_init_empty (&eps_via_nodes);
1393
1394	if (!regmatch_list_resize (&prev_match, nmatch))
1395	{
1396	regmatch_list_free (&prev_match);
1397	free_fail_stack_return (fs);
1398	return REG_ESPACE;
1399	}
1400	regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1401	memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1402
1403	for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1404	{
1405	update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1406
1407	if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1408	\|\| (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1409	{
1410	Idx reg_idx;
1411	cur_node = -1;
1412	if (fs)
1413	{
1414	for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1415	if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1416	{
1417	cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1418	prev_idx_match, &eps_via_nodes);
1419	break;
1420	}
1421	}
1422	if (cur_node < 0)
1423	{
1424	re_node_set_free (&eps_via_nodes);
1425	regmatch_list_free (&prev_match);
1426	return free_fail_stack_return (fs);
1427	}
1428	}
1429
1430	/* Proceed to next node. */
1431	cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1432	&idx, cur_node,
1433	&eps_via_nodes, fs);
1434
1435	if (__glibc_unlikely (cur_node < 0))
1436	{
1437	if (__glibc_unlikely (cur_node == -2))
1438	{
1439	re_node_set_free (&eps_via_nodes);
1440	regmatch_list_free (&prev_match);
1441	free_fail_stack_return (fs);
1442	return REG_ESPACE;
1443	}
1444	cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1445	prev_idx_match, &eps_via_nodes);
1446	if (cur_node < 0)
1447	{
1448	re_node_set_free (&eps_via_nodes);
1449	regmatch_list_free (&prev_match);
1450	free_fail_stack_return (fs);
1451	return REG_NOMATCH;
1452	}
1453	}
1454	}
1455	re_node_set_free (&eps_via_nodes);
1456	regmatch_list_free (&prev_match);
1457	return free_fail_stack_return (fs);
1458	}
1459
1460	static reg_errcode_t
1461	free_fail_stack_return (struct re_fail_stack_t *fs)
1462	{
1463	if (fs)
1464	{
1465	Idx fs_idx;
1466	for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1467	{
1468	re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1469	re_free (fs->stack[fs_idx].regs);
1470	}
1471	re_free (fs->stack);
1472	}
1473	return REG_NOERROR;
1474	}
1475
1476	static void
1477	update_regs (const re_dfa_t dfa, regmatch_t pmatch,
1478	regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1479	{
1480	int type = dfa->nodes[cur_node].type;
1481	if (type == OP_OPEN_SUBEXP)
1482	{
1483	Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1484
1485	/* We are at the first node of this sub expression. */
1486	if (reg_num < nmatch)
1487	{
1488	pmatch[reg_num].rm_so = cur_idx;
1489	pmatch[reg_num].rm_eo = -1;
1490	}
1491	}
1492	else if (type == OP_CLOSE_SUBEXP)
1493	{
1494	/* We are at the last node of this sub expression. */
1495	Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1496	if (reg_num < nmatch)
1497	{
1498	if (pmatch[reg_num].rm_so < cur_idx)
1499	{
1500	pmatch[reg_num].rm_eo = cur_idx;
1501	/* This is a non-empty match or we are not inside an optional
1502	subexpression. Accept this right away. */
1503	memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1504	}
1505	else
1506	{
1507	if (dfa->nodes[cur_node].opt_subexp
1508	&& prev_idx_match[reg_num].rm_so != -1)
1509	/* We transited through an empty match for an optional
1510	subexpression, like (a?)*, and this is not the subexp's
1511	first match. Copy back the old content of the registers
1512	so that matches of an inner subexpression are undone as
1513	well, like in ((a?)). /
1514	memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1515	else
1516	/* We completed a subexpression, but it may be part of
1517	an optional one, so do not update PREV_IDX_MATCH. */
1518	pmatch[reg_num].rm_eo = cur_idx;
1519	}
1520	}
1521	}
1522	}
1523
1524	/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1525	and sift the nodes in each states according to the following rules.
1526	Updated state_log will be wrote to STATE_LOG.
1527
1528	Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1529	1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1530	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1531	the LAST_NODE, we throw away the node 'a'.
1532	2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1533	string 's' and transit to 'b':
1534	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1535	away the node 'a'.
1536	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1537	thrown away, we throw away the node 'a'.
1538	3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1539	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1540	node 'a'.
1541	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1542	we throw away the node 'a'. */
1543
1544	#define STATE_NODE_CONTAINS(state,node) \
1545	((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1546
1547	static reg_errcode_t
1548	sift_states_backward (const re_match_context_t mctx, re_sift_context_t sctx)
1549	{
1550	reg_errcode_t err;
1551	int null_cnt = 0;
1552	Idx str_idx = sctx->last_str_idx;
1553	re_node_set cur_dest;
1554
1555	DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1556
1557	/* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1558	transit to the last_node and the last_node itself. */
1559	err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1560	if (__glibc_unlikely (err != REG_NOERROR))
1561	return err;
1562	err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1563	if (__glibc_unlikely (err != REG_NOERROR))
1564	goto free_return;
1565
1566	/* Then check each states in the state_log. */
1567	while (str_idx > 0)
1568	{
1569	/* Update counters. */
1570	null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1571	if (null_cnt > mctx->max_mb_elem_len)
1572	{
1573	memset (sctx->sifted_states, '\0',
1574	sizeof (re_dfastate_t ) str_idx);
1575	re_node_set_free (&cur_dest);
1576	return REG_NOERROR;
1577	}
1578	re_node_set_empty (&cur_dest);
1579	--str_idx;
1580
1581	if (mctx->state_log[str_idx])
1582	{
1583	err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1584	if (__glibc_unlikely (err != REG_NOERROR))
1585	goto free_return;
1586	}
1587
1588	/* Add all the nodes which satisfy the following conditions:
1589	- It can epsilon transit to a node in CUR_DEST.
1590	- It is in CUR_SRC.
1591	And update state_log. */
1592	err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1593	if (__glibc_unlikely (err != REG_NOERROR))
1594	goto free_return;
1595	}
1596	err = REG_NOERROR;
1597	free_return:
1598	re_node_set_free (&cur_dest);
1599	return err;
1600	}
1601
1602	static reg_errcode_t
1603	__attribute_warn_unused_result__
1604	build_sifted_states (const re_match_context_t mctx, re_sift_context_t sctx,
1605	Idx str_idx, re_node_set *cur_dest)
1606	{
1607	const re_dfa_t *const dfa = mctx->dfa;
1608	const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1609	Idx i;
1610
1611	/* Then build the next sifted state.
1612	We build the next sifted state on 'cur_dest', and update
1613	'sifted_states[str_idx]' with 'cur_dest'.
1614	Note:
1615	'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1616	'cur_src' points the node_set of the old 'state_log[str_idx]'
1617	(with the epsilon nodes pre-filtered out). */
1618	for (i = 0; i < cur_src->nelem; i++)
1619	{
1620	Idx prev_node = cur_src->elems[i];
1621	int naccepted = 0;
1622	bool ok;
1623	DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1624
1625	/* If the node may accept "multi byte". */
1626	if (dfa->nodes[prev_node].accept_mb)
1627	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1628	str_idx, sctx->last_str_idx);
1629
1630	/* We don't check backreferences here.
1631	See update_cur_sifted_state(). */
1632	if (!naccepted
1633	&& check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1634	&& STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1635	dfa->nexts[prev_node]))
1636	naccepted = 1;
1637
1638	if (naccepted == 0)
1639	continue;
1640
1641	if (sctx->limits.nelem)
1642	{
1643	Idx to_idx = str_idx + naccepted;
1644	if (check_dst_limits (mctx, &sctx->limits,
1645	dfa->nexts[prev_node], to_idx,
1646	prev_node, str_idx))
1647	continue;
1648	}
1649	ok = re_node_set_insert (cur_dest, prev_node);
1650	if (__glibc_unlikely (! ok))
1651	return REG_ESPACE;
1652	}
1653
1654	return REG_NOERROR;
1655	}
1656
1657	/* Helper functions. */
1658
1659	static reg_errcode_t
1660	clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1661	{
1662	Idx top = mctx->state_log_top;
1663
1664	if ((next_state_log_idx >= mctx->input.bufs_len
1665	&& mctx->input.bufs_len < mctx->input.len)
1666	\|\| (next_state_log_idx >= mctx->input.valid_len
1667	&& mctx->input.valid_len < mctx->input.len))
1668	{
1669	reg_errcode_t err;
1670	err = extend_buffers (mctx, next_state_log_idx + 1);
1671	if (__glibc_unlikely (err != REG_NOERROR))
1672	return err;
1673	}
1674
1675	if (top < next_state_log_idx)
1676	{
1677	DEBUG_ASSERT (mctx->state_log != NULL);
1678	memset (mctx->state_log + top + 1, '\0',
1679	sizeof (re_dfastate_t ) (next_state_log_idx - top));
1680	mctx->state_log_top = next_state_log_idx;
1681	}
1682	return REG_NOERROR;
1683	}
1684
1685	static reg_errcode_t
1686	merge_state_array (const re_dfa_t dfa, re_dfastate_t *dst,
1687	re_dfastate_t **src, Idx num)
1688	{
1689	Idx st_idx;
1690	reg_errcode_t err;
1691	for (st_idx = 0; st_idx < num; ++st_idx)
1692	{
1693	if (dst[st_idx] == NULL)
1694	dst[st_idx] = src[st_idx];
1695	else if (src[st_idx] != NULL)
1696	{
1697	re_node_set merged_set;
1698	err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1699	&src[st_idx]->nodes);
1700	if (__glibc_unlikely (err != REG_NOERROR))
1701	return err;
1702	dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1703	re_node_set_free (&merged_set);
1704	if (__glibc_unlikely (err != REG_NOERROR))
1705	return err;
1706	}
1707	}
1708	return REG_NOERROR;
1709	}
1710
1711	static reg_errcode_t
1712	update_cur_sifted_state (const re_match_context_t *mctx,
1713	re_sift_context_t *sctx, Idx str_idx,
1714	re_node_set *dest_nodes)
1715	{
1716	const re_dfa_t *const dfa = mctx->dfa;
1717	reg_errcode_t err = REG_NOERROR;
1718	const re_node_set *candidates;
1719	candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1720	: &mctx->state_log[str_idx]->nodes);
1721
1722	if (dest_nodes->nelem == 0)
1723	sctx->sifted_states[str_idx] = NULL;
1724	else
1725	{
1726	if (candidates)
1727	{
1728	/* At first, add the nodes which can epsilon transit to a node in
1729	DEST_NODE. */
1730	err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1731	if (__glibc_unlikely (err != REG_NOERROR))
1732	return err;
1733
1734	/* Then, check the limitations in the current sift_context. */
1735	if (sctx->limits.nelem)
1736	{
1737	err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1738	mctx->bkref_ents, str_idx);
1739	if (__glibc_unlikely (err != REG_NOERROR))
1740	return err;
1741	}
1742	}
1743
1744	sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1745	if (__glibc_unlikely (err != REG_NOERROR))
1746	return err;
1747	}
1748
1749	if (candidates && mctx->state_log[str_idx]->has_backref)
1750	{
1751	err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1752	if (__glibc_unlikely (err != REG_NOERROR))
1753	return err;
1754	}
1755	return REG_NOERROR;
1756	}
1757
1758	static reg_errcode_t
1759	__attribute_warn_unused_result__
1760	add_epsilon_src_nodes (const re_dfa_t dfa, re_node_set dest_nodes,
1761	const re_node_set *candidates)
1762	{
1763	reg_errcode_t err = REG_NOERROR;
1764	Idx i;
1765
1766	re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1767	if (__glibc_unlikely (err != REG_NOERROR))
1768	return err;
1769
1770	if (!state->inveclosure.alloc)
1771	{
1772	err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1773	if (__glibc_unlikely (err != REG_NOERROR))
1774	return REG_ESPACE;
1775	for (i = 0; i < dest_nodes->nelem; i++)
1776	{
1777	err = re_node_set_merge (&state->inveclosure,
1778	dfa->inveclosures + dest_nodes->elems[i]);
1779	if (__glibc_unlikely (err != REG_NOERROR))
1780	return REG_ESPACE;
1781	}
1782	}
1783	return re_node_set_add_intersect (dest_nodes, candidates,
1784	&state->inveclosure);
1785	}
1786
1787	static reg_errcode_t
1788	sub_epsilon_src_nodes (const re_dfa_t dfa, Idx node, re_node_set dest_nodes,
1789	const re_node_set *candidates)
1790	{
1791	Idx ecl_idx;
1792	reg_errcode_t err;
1793	re_node_set *inv_eclosure = dfa->inveclosures + node;
1794	re_node_set except_nodes;
1795	re_node_set_init_empty (&except_nodes);
1796	for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1797	{
1798	Idx cur_node = inv_eclosure->elems[ecl_idx];
1799	if (cur_node == node)
1800	continue;
1801	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1802	{
1803	Idx edst1 = dfa->edests[cur_node].elems[0];
1804	Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1805	? dfa->edests[cur_node].elems[1] : -1);
1806	if ((!re_node_set_contains (inv_eclosure, edst1)
1807	&& re_node_set_contains (dest_nodes, edst1))
1808	\|\| (edst2 > 0
1809	&& !re_node_set_contains (inv_eclosure, edst2)
1810	&& re_node_set_contains (dest_nodes, edst2)))
1811	{
1812	err = re_node_set_add_intersect (&except_nodes, candidates,
1813	dfa->inveclosures + cur_node);
1814	if (__glibc_unlikely (err != REG_NOERROR))
1815	{
1816	re_node_set_free (&except_nodes);
1817	return err;
1818	}
1819	}
1820	}
1821	}
1822	for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823	{
1824	Idx cur_node = inv_eclosure->elems[ecl_idx];
1825	if (!re_node_set_contains (&except_nodes, cur_node))
1826	{
1827	Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1828	re_node_set_remove_at (dest_nodes, idx);
1829	}
1830	}
1831	re_node_set_free (&except_nodes);
1832	return REG_NOERROR;
1833	}
1834
1835	static bool
1836	check_dst_limits (const re_match_context_t mctx, const re_node_set limits,
1837	Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1838	{
1839	const re_dfa_t *const dfa = mctx->dfa;
1840	Idx lim_idx, src_pos, dst_pos;
1841
1842	Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1843	Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1844	for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1845	{
1846	Idx subexp_idx;
1847	struct re_backref_cache_entry *ent;
1848	ent = mctx->bkref_ents + limits->elems[lim_idx];
1849	subexp_idx = dfa->nodes[ent->node].opr.idx;
1850
1851	dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1852	subexp_idx, dst_node, dst_idx,
1853	dst_bkref_idx);
1854	src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1855	subexp_idx, src_node, src_idx,
1856	src_bkref_idx);
1857
1858	/* In case of:
1859	<src> <dst> ( <subexp> )
1860	( <subexp> ) <src> <dst>
1861	( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1862	if (src_pos == dst_pos)
1863	continue; /* This is unrelated limitation. */
1864	else
1865	return true;
1866	}
1867	return false;
1868	}
1869
1870	static int
1871	check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1872	Idx subexp_idx, Idx from_node, Idx bkref_idx)
1873	{
1874	const re_dfa_t *const dfa = mctx->dfa;
1875	const re_node_set *eclosures = dfa->eclosures + from_node;
1876	Idx node_idx;
1877
1878	/* Else, we are on the boundary: examine the nodes on the epsilon
1879	closure. */
1880	for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1881	{
1882	Idx node = eclosures->elems[node_idx];
1883	switch (dfa->nodes[node].type)
1884	{
1885	case OP_BACK_REF:
1886	if (bkref_idx != -1)
1887	{
1888	struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1889	do
1890	{
1891	Idx dst;
1892	int cpos;
1893
1894	if (ent->node != node)
1895	continue;
1896
1897	if (subexp_idx < BITSET_WORD_BITS
1898	&& !(ent->eps_reachable_subexps_map
1899	& ((bitset_word_t) 1 << subexp_idx)))
1900	continue;
1901
1902	/* Recurse trying to reach the OP_OPEN_SUBEXP and
1903	OP_CLOSE_SUBEXP cases below. But, if the
1904	destination node is the same node as the source
1905	node, don't recurse because it would cause an
1906	infinite loop: a regex that exhibits this behavior
1907	is ()\1\1 */
1908	dst = dfa->edests[node].elems[0];
1909	if (dst == from_node)
1910	{
1911	if (boundaries & 1)
1912	return -1;
1913	else /* if (boundaries & 2) */
1914	return 0;
1915	}
1916
1917	cpos =
1918	check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1919	dst, bkref_idx);
1920	if (cpos == -1 /* && (boundaries & 1) */)
1921	return -1;
1922	if (cpos == 0 && (boundaries & 2))
1923	return 0;
1924
1925	if (subexp_idx < BITSET_WORD_BITS)
1926	ent->eps_reachable_subexps_map
1927	&= ~((bitset_word_t) 1 << subexp_idx);
1928	}
1929	while (ent++->more);
1930	}
1931	break;
1932
1933	case OP_OPEN_SUBEXP:
1934	if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1935	return -1;
1936	break;
1937
1938	case OP_CLOSE_SUBEXP:
1939	if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1940	return 0;
1941	break;
1942
1943	default:
1944	break;
1945	}
1946	}
1947
1948	return (boundaries & 2) ? 1 : 0;
1949	}
1950
1951	static int
1952	check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1953	Idx subexp_idx, Idx from_node, Idx str_idx,
1954	Idx bkref_idx)
1955	{
1956	struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1957	int boundaries;
1958
1959	/* If we are outside the range of the subexpression, return -1 or 1. */
1960	if (str_idx < lim->subexp_from)
1961	return -1;
1962
1963	if (lim->subexp_to < str_idx)
1964	return 1;
1965
1966	/* If we are within the subexpression, return 0. */
1967	boundaries = (str_idx == lim->subexp_from);
1968	boundaries \|= (str_idx == lim->subexp_to) << 1;
1969	if (boundaries == 0)
1970	return 0;
1971
1972	/* Else, examine epsilon closure. */
1973	return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1974	from_node, bkref_idx);
1975	}
1976
1977	/* Check the limitations of sub expressions LIMITS, and remove the nodes
1978	which are against limitations from DEST_NODES. */
1979
1980	static reg_errcode_t
1981	check_subexp_limits (const re_dfa_t dfa, re_node_set dest_nodes,
1982	const re_node_set candidates, re_node_set limits,
1983	struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1984	{
1985	reg_errcode_t err;
1986	Idx node_idx, lim_idx;
1987
1988	for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1989	{
1990	Idx subexp_idx;
1991	struct re_backref_cache_entry *ent;
1992	ent = bkref_ents + limits->elems[lim_idx];
1993
1994	if (str_idx <= ent->subexp_from \|\| ent->str_idx < str_idx)
1995	continue; /* This is unrelated limitation. */
1996
1997	subexp_idx = dfa->nodes[ent->node].opr.idx;
1998	if (ent->subexp_to == str_idx)
1999	{
2000	Idx ops_node = -1;
2001	Idx cls_node = -1;
2002	for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2003	{
2004	Idx node = dest_nodes->elems[node_idx];
2005	re_token_type_t type = dfa->nodes[node].type;
2006	if (type == OP_OPEN_SUBEXP
2007	&& subexp_idx == dfa->nodes[node].opr.idx)
2008	ops_node = node;
2009	else if (type == OP_CLOSE_SUBEXP
2010	&& subexp_idx == dfa->nodes[node].opr.idx)
2011	cls_node = node;
2012	}
2013
2014	/* Check the limitation of the open subexpression. */
2015	/* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2016	if (ops_node >= 0)
2017	{
2018	err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2019	candidates);
2020	if (__glibc_unlikely (err != REG_NOERROR))
2021	return err;
2022	}
2023
2024	/* Check the limitation of the close subexpression. */
2025	if (cls_node >= 0)
2026	for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2027	{
2028	Idx node = dest_nodes->elems[node_idx];
2029	if (!re_node_set_contains (dfa->inveclosures + node,
2030	cls_node)
2031	&& !re_node_set_contains (dfa->eclosures + node,
2032	cls_node))
2033	{
2034	/* It is against this limitation.
2035	Remove it form the current sifted state. */
2036	err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2037	candidates);
2038	if (__glibc_unlikely (err != REG_NOERROR))
2039	return err;
2040	--node_idx;
2041	}
2042	}
2043	}
2044	else /* (ent->subexp_to != str_idx) */
2045	{
2046	for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2047	{
2048	Idx node = dest_nodes->elems[node_idx];
2049	re_token_type_t type = dfa->nodes[node].type;
2050	if (type == OP_CLOSE_SUBEXP \|\| type == OP_OPEN_SUBEXP)
2051	{
2052	if (subexp_idx != dfa->nodes[node].opr.idx)
2053	continue;
2054	/* It is against this limitation.
2055	Remove it form the current sifted state. */
2056	err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2057	candidates);
2058	if (__glibc_unlikely (err != REG_NOERROR))
2059	return err;
2060	}
2061	}
2062	}
2063	}
2064	return REG_NOERROR;
2065	}
2066
2067	static reg_errcode_t
2068	__attribute_warn_unused_result__
2069	sift_states_bkref (const re_match_context_t mctx, re_sift_context_t sctx,
2070	Idx str_idx, const re_node_set *candidates)
2071	{
2072	const re_dfa_t *const dfa = mctx->dfa;
2073	reg_errcode_t err;
2074	Idx node_idx, node;
2075	re_sift_context_t local_sctx;
2076	Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2077
2078	if (first_idx == -1)
2079	return REG_NOERROR;
2080
2081	local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2082
2083	for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2084	{
2085	Idx enabled_idx;
2086	re_token_type_t type;
2087	struct re_backref_cache_entry *entry;
2088	node = candidates->elems[node_idx];
2089	type = dfa->nodes[node].type;
2090	/* Avoid infinite loop for the REs like "()\1+". */
2091	if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2092	continue;
2093	if (type != OP_BACK_REF)
2094	continue;
2095
2096	entry = mctx->bkref_ents + first_idx;
2097	enabled_idx = first_idx;
2098	do
2099	{
2100	Idx subexp_len;
2101	Idx to_idx;
2102	Idx dst_node;
2103	bool ok;
2104	re_dfastate_t *cur_state;
2105
2106	if (entry->node != node)
2107	continue;
2108	subexp_len = entry->subexp_to - entry->subexp_from;
2109	to_idx = str_idx + subexp_len;
2110	dst_node = (subexp_len ? dfa->nexts[node]
2111	: dfa->edests[node].elems[0]);
2112
2113	if (to_idx > sctx->last_str_idx
2114	\|\| sctx->sifted_states[to_idx] == NULL
2115	\|\| !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2116	\|\| check_dst_limits (mctx, &sctx->limits, node,
2117	str_idx, dst_node, to_idx))
2118	continue;
2119
2120	if (local_sctx.sifted_states == NULL)
2121	{
2122	local_sctx = *sctx;
2123	err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2124	if (__glibc_unlikely (err != REG_NOERROR))
2125	goto free_return;
2126	}
2127	local_sctx.last_node = node;
2128	local_sctx.last_str_idx = str_idx;
2129	ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2130	if (__glibc_unlikely (! ok))
2131	{
2132	err = REG_ESPACE;
2133	goto free_return;
2134	}
2135	cur_state = local_sctx.sifted_states[str_idx];
2136	err = sift_states_backward (mctx, &local_sctx);
2137	if (__glibc_unlikely (err != REG_NOERROR))
2138	goto free_return;
2139	if (sctx->limited_states != NULL)
2140	{
2141	err = merge_state_array (dfa, sctx->limited_states,
2142	local_sctx.sifted_states,
2143	str_idx + 1);
2144	if (__glibc_unlikely (err != REG_NOERROR))
2145	goto free_return;
2146	}
2147	local_sctx.sifted_states[str_idx] = cur_state;
2148	re_node_set_remove (&local_sctx.limits, enabled_idx);
2149
2150	/* mctx->bkref_ents may have changed, reload the pointer. */
2151	entry = mctx->bkref_ents + enabled_idx;
2152	}
2153	while (enabled_idx++, entry++->more);
2154	}
2155	err = REG_NOERROR;
2156	free_return:
2157	if (local_sctx.sifted_states != NULL)
2158	{
2159	re_node_set_free (&local_sctx.limits);
2160	}
2161
2162	return err;
2163	}
2164
2165
2166	static int
2167	sift_states_iter_mb (const re_match_context_t mctx, re_sift_context_t sctx,
2168	Idx node_idx, Idx str_idx, Idx max_str_idx)
2169	{
2170	const re_dfa_t *const dfa = mctx->dfa;
2171	int naccepted;
2172	/* Check the node can accept "multi byte". */
2173	naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2174	if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2175	&& !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2176	dfa->nexts[node_idx]))
2177	/* The node can't accept the "multi byte", or the
2178	destination was already thrown away, then the node
2179	couldn't accept the current input "multi byte". */
2180	naccepted = 0;
2181	/* Otherwise, it is sure that the node could accept
2182	'naccepted' bytes input. */
2183	return naccepted;
2184	}
2185
2186
2187	/* Functions for state transition. */
2188
2189	/* Return the next state to which the current state STATE will transit by
2190	accepting the current input byte, and update STATE_LOG if necessary.
2191	Return NULL on failure.
2192	If STATE can accept a multibyte char/collating element/back reference
2193	update the destination of STATE_LOG. */
2194
2195	static re_dfastate_t *
2196	__attribute_warn_unused_result__
2197	transit_state (reg_errcode_t err, re_match_context_t mctx,
2198	re_dfastate_t *state)
2199	{
2200	re_dfastate_t **trtable;
2201	unsigned char ch;
2202
2203	/* If the current state can accept multibyte. */
2204	if (__glibc_unlikely (state->accept_mb))
2205	{
2206	*err = transit_state_mb (mctx, state);
2207	if (__glibc_unlikely (*err != REG_NOERROR))
2208	return NULL;
2209	}
2210
2211	/* Then decide the next state with the single byte. */
2212	#if 0
2213	if (0)
2214	/* don't use transition table */
2215	return transit_state_sb (err, mctx, state);
2216	#endif
2217
2218	/* Use transition table */
2219	ch = re_string_fetch_byte (&mctx->input);
2220	for (;;)
2221	{
2222	trtable = state->trtable;
2223	if (__glibc_likely (trtable != NULL))
2224	return trtable[ch];
2225
2226	trtable = state->word_trtable;
2227	if (__glibc_likely (trtable != NULL))
2228	{
2229	unsigned int context;
2230	context
2231	= re_string_context_at (&mctx->input,
2232	re_string_cur_idx (&mctx->input) - 1,
2233	mctx->eflags);
2234	if (IS_WORD_CONTEXT (context))
2235	return trtable[ch + SBC_MAX];
2236	else
2237	return trtable[ch];
2238	}
2239
2240	if (!build_trtable (mctx->dfa, state))
2241	{
2242	*err = REG_ESPACE;
2243	return NULL;
2244	}
2245
2246	/* Retry, we now have a transition table. */
2247	}
2248	}
2249
2250	/* Update the state_log if we need */
2251	static re_dfastate_t *
2252	merge_state_with_log (reg_errcode_t err, re_match_context_t mctx,
2253	re_dfastate_t *next_state)
2254	{
2255	const re_dfa_t *const dfa = mctx->dfa;
2256	Idx cur_idx = re_string_cur_idx (&mctx->input);
2257
2258	if (cur_idx > mctx->state_log_top)
2259	{
2260	mctx->state_log[cur_idx] = next_state;
2261	mctx->state_log_top = cur_idx;
2262	}
2263	else if (mctx->state_log[cur_idx] == 0)
2264	{
2265	mctx->state_log[cur_idx] = next_state;
2266	}
2267	else
2268	{
2269	re_dfastate_t *pstate;
2270	unsigned int context;
2271	re_node_set next_nodes, log_nodes, table_nodes = NULL;
2272	/* If (state_log[cur_idx] != 0), it implies that cur_idx is
2273	the destination of a multibyte char/collating element/
2274	back reference. Then the next state is the union set of
2275	these destinations and the results of the transition table. */
2276	pstate = mctx->state_log[cur_idx];
2277	log_nodes = pstate->entrance_nodes;
2278	if (next_state != NULL)
2279	{
2280	table_nodes = next_state->entrance_nodes;
2281	*err = re_node_set_init_union (&next_nodes, table_nodes,
2282	log_nodes);
2283	if (__glibc_unlikely (*err != REG_NOERROR))
2284	return NULL;
2285	}
2286	else
2287	next_nodes = *log_nodes;
2288	/* Note: We already add the nodes of the initial state,
2289	then we don't need to add them here. */
2290
2291	context = re_string_context_at (&mctx->input,
2292	re_string_cur_idx (&mctx->input) - 1,
2293	mctx->eflags);
2294	next_state = mctx->state_log[cur_idx]
2295	= re_acquire_state_context (err, dfa, &next_nodes, context);
2296	/* We don't need to check errors here, since the return value of
2297	this function is next_state and ERR is already set. */
2298
2299	if (table_nodes != NULL)
2300	re_node_set_free (&next_nodes);
2301	}
2302
2303	if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2304	{
2305	/* Check OP_OPEN_SUBEXP in the current state in case that we use them
2306	later. We must check them here, since the back references in the
2307	next state might use them. */
2308	*err = check_subexp_matching_top (mctx, &next_state->nodes,
2309	cur_idx);
2310	if (__glibc_unlikely (*err != REG_NOERROR))
2311	return NULL;
2312
2313	/* If the next state has back references. */
2314	if (next_state->has_backref)
2315	{
2316	*err = transit_state_bkref (mctx, &next_state->nodes);
2317	if (__glibc_unlikely (*err != REG_NOERROR))
2318	return NULL;
2319	next_state = mctx->state_log[cur_idx];
2320	}
2321	}
2322
2323	return next_state;
2324	}
2325
2326	/* Skip bytes in the input that correspond to part of a
2327	multi-byte match, then look in the log for a state
2328	from which to restart matching. */
2329	static re_dfastate_t *
2330	find_recover_state (reg_errcode_t err, re_match_context_t mctx)
2331	{
2332	re_dfastate_t *cur_state;
2333	do
2334	{
2335	Idx max = mctx->state_log_top;
2336	Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2337
2338	do
2339	{
2340	if (++cur_str_idx > max)
2341	return NULL;
2342	re_string_skip_bytes (&mctx->input, 1);
2343	}
2344	while (mctx->state_log[cur_str_idx] == NULL);
2345
2346	cur_state = merge_state_with_log (err, mctx, NULL);
2347	}
2348	while (*err == REG_NOERROR && cur_state == NULL);
2349	return cur_state;
2350	}
2351
2352	/* Helper functions for transit_state. */
2353
2354	/* From the node set CUR_NODES, pick up the nodes whose types are
2355	OP_OPEN_SUBEXP and which have corresponding back references in the regular
2356	expression. And register them to use them later for evaluating the
2357	corresponding back references. */
2358
2359	static reg_errcode_t
2360	check_subexp_matching_top (re_match_context_t mctx, re_node_set cur_nodes,
2361	Idx str_idx)
2362	{
2363	const re_dfa_t *const dfa = mctx->dfa;
2364	Idx node_idx;
2365	reg_errcode_t err;
2366
2367	/* TODO: This isn't efficient.
2368	Because there might be more than one nodes whose types are
2369	OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2370	nodes.
2371	E.g. RE: (a){2} */
2372	for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2373	{
2374	Idx node = cur_nodes->elems[node_idx];
2375	if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2376	&& dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2377	&& (dfa->used_bkref_map
2378	& ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2379	{
2380	err = match_ctx_add_subtop (mctx, node, str_idx);
2381	if (__glibc_unlikely (err != REG_NOERROR))
2382	return err;
2383	}
2384	}
2385	return REG_NOERROR;
2386	}
2387
2388	#if 0
2389	/* Return the next state to which the current state STATE will transit by
2390	accepting the current input byte. Return NULL on failure. */
2391
2392	static re_dfastate_t *
2393	transit_state_sb (reg_errcode_t err, re_match_context_t mctx,
2394	re_dfastate_t *state)
2395	{
2396	const re_dfa_t *const dfa = mctx->dfa;
2397	re_node_set next_nodes;
2398	re_dfastate_t *next_state;
2399	Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2400	unsigned int context;
2401
2402	*err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2403	if (__glibc_unlikely (*err != REG_NOERROR))
2404	return NULL;
2405	for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2406	{
2407	Idx cur_node = state->nodes.elems[node_cnt];
2408	if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2409	{
2410	*err = re_node_set_merge (&next_nodes,
2411	dfa->eclosures + dfa->nexts[cur_node]);
2412	if (__glibc_unlikely (*err != REG_NOERROR))
2413	{
2414	re_node_set_free (&next_nodes);
2415	return NULL;
2416	}
2417	}
2418	}
2419	context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2420	next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2421	/* We don't need to check errors here, since the return value of
2422	this function is next_state and ERR is already set. */
2423
2424	re_node_set_free (&next_nodes);
2425	re_string_skip_bytes (&mctx->input, 1);
2426	return next_state;
2427	}
2428	#endif
2429
2430	static reg_errcode_t
2431	transit_state_mb (re_match_context_t mctx, re_dfastate_t pstate)
2432	{
2433	const re_dfa_t *const dfa = mctx->dfa;
2434	reg_errcode_t err;
2435	Idx i;
2436
2437	for (i = 0; i < pstate->nodes.nelem; ++i)
2438	{
2439	re_node_set dest_nodes, *new_nodes;
2440	Idx cur_node_idx = pstate->nodes.elems[i];
2441	int naccepted;
2442	Idx dest_idx;
2443	unsigned int context;
2444	re_dfastate_t *dest_state;
2445
2446	if (!dfa->nodes[cur_node_idx].accept_mb)
2447	continue;
2448
2449	if (dfa->nodes[cur_node_idx].constraint)
2450	{
2451	context = re_string_context_at (&mctx->input,
2452	re_string_cur_idx (&mctx->input),
2453	mctx->eflags);
2454	if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2455	context))
2456	continue;
2457	}
2458
2459	/* How many bytes the node can accept? */
2460	naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2461	re_string_cur_idx (&mctx->input));
2462	if (naccepted == 0)
2463	continue;
2464
2465	/* The node can accepts 'naccepted' bytes. */
2466	dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2467	mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2468	: mctx->max_mb_elem_len);
2469	err = clean_state_log_if_needed (mctx, dest_idx);
2470	if (__glibc_unlikely (err != REG_NOERROR))
2471	return err;
2472	DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2473	new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2474
2475	dest_state = mctx->state_log[dest_idx];
2476	if (dest_state == NULL)
2477	dest_nodes = *new_nodes;
2478	else
2479	{
2480	err = re_node_set_init_union (&dest_nodes,
2481	dest_state->entrance_nodes, new_nodes);
2482	if (__glibc_unlikely (err != REG_NOERROR))
2483	return err;
2484	}
2485	context = re_string_context_at (&mctx->input, dest_idx - 1,
2486	mctx->eflags);
2487	mctx->state_log[dest_idx]
2488	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2489	if (dest_state != NULL)
2490	re_node_set_free (&dest_nodes);
2491	if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2492	&& err != REG_NOERROR))
2493	return err;
2494	}
2495	return REG_NOERROR;
2496	}
2497
2498	static reg_errcode_t
2499	transit_state_bkref (re_match_context_t mctx, const re_node_set nodes)
2500	{
2501	const re_dfa_t *const dfa = mctx->dfa;
2502	reg_errcode_t err;
2503	Idx i;
2504	Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2505
2506	for (i = 0; i < nodes->nelem; ++i)
2507	{
2508	Idx dest_str_idx, prev_nelem, bkc_idx;
2509	Idx node_idx = nodes->elems[i];
2510	unsigned int context;
2511	const re_token_t *node = dfa->nodes + node_idx;
2512	re_node_set *new_dest_nodes;
2513
2514	/* Check whether 'node' is a backreference or not. */
2515	if (node->type != OP_BACK_REF)
2516	continue;
2517
2518	if (node->constraint)
2519	{
2520	context = re_string_context_at (&mctx->input, cur_str_idx,
2521	mctx->eflags);
2522	if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2523	continue;
2524	}
2525
2526	/* 'node' is a backreference.
2527	Check the substring which the substring matched. */
2528	bkc_idx = mctx->nbkref_ents;
2529	err = get_subexp (mctx, node_idx, cur_str_idx);
2530	if (__glibc_unlikely (err != REG_NOERROR))
2531	goto free_return;
2532
2533	/* And add the epsilon closures (which is 'new_dest_nodes') of
2534	the backreference to appropriate state_log. */
2535	DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2536	for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2537	{
2538	Idx subexp_len;
2539	re_dfastate_t *dest_state;
2540	struct re_backref_cache_entry *bkref_ent;
2541	bkref_ent = mctx->bkref_ents + bkc_idx;
2542	if (bkref_ent->node != node_idx \|\| bkref_ent->str_idx != cur_str_idx)
2543	continue;
2544	subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2545	new_dest_nodes = (subexp_len == 0
2546	? dfa->eclosures + dfa->edests[node_idx].elems[0]
2547	: dfa->eclosures + dfa->nexts[node_idx]);
2548	dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2549	- bkref_ent->subexp_from);
2550	context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2551	mctx->eflags);
2552	dest_state = mctx->state_log[dest_str_idx];
2553	prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2554	: mctx->state_log[cur_str_idx]->nodes.nelem);
2555	/* Add 'new_dest_node' to state_log. */
2556	if (dest_state == NULL)
2557	{
2558	mctx->state_log[dest_str_idx]
2559	= re_acquire_state_context (&err, dfa, new_dest_nodes,
2560	context);
2561	if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2562	&& err != REG_NOERROR))
2563	goto free_return;
2564	}
2565	else
2566	{
2567	re_node_set dest_nodes;
2568	err = re_node_set_init_union (&dest_nodes,
2569	dest_state->entrance_nodes,
2570	new_dest_nodes);
2571	if (__glibc_unlikely (err != REG_NOERROR))
2572	{
2573	re_node_set_free (&dest_nodes);
2574	goto free_return;
2575	}
2576	mctx->state_log[dest_str_idx]
2577	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2578	re_node_set_free (&dest_nodes);
2579	if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2580	&& err != REG_NOERROR))
2581	goto free_return;
2582	}
2583	/* We need to check recursively if the backreference can epsilon
2584	transit. */
2585	if (subexp_len == 0
2586	&& mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2587	{
2588	err = check_subexp_matching_top (mctx, new_dest_nodes,
2589	cur_str_idx);
2590	if (__glibc_unlikely (err != REG_NOERROR))
2591	goto free_return;
2592	err = transit_state_bkref (mctx, new_dest_nodes);
2593	if (__glibc_unlikely (err != REG_NOERROR))
2594	goto free_return;
2595	}
2596	}
2597	}
2598	err = REG_NOERROR;
2599	free_return:
2600	return err;
2601	}
2602
2603	/* Enumerate all the candidates which the backreference BKREF_NODE can match
2604	at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2605	Note that we might collect inappropriate candidates here.
2606	However, the cost of checking them strictly here is too high, then we
2607	delay these checking for prune_impossible_nodes(). */
2608
2609	static reg_errcode_t
2610	__attribute_warn_unused_result__
2611	get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2612	{
2613	const re_dfa_t *const dfa = mctx->dfa;
2614	Idx subexp_num, sub_top_idx;
2615	const char buf = (const char ) re_string_get_buffer (&mctx->input);
2616	/* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2617	Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2618	if (cache_idx != -1)
2619	{
2620	const struct re_backref_cache_entry *entry
2621	= mctx->bkref_ents + cache_idx;
2622	do
2623	if (entry->node == bkref_node)
2624	return REG_NOERROR; /* We already checked it. */
2625	while (entry++->more);
2626	}
2627
2628	subexp_num = dfa->nodes[bkref_node].opr.idx;
2629
2630	/* For each sub expression */
2631	for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2632	{
2633	reg_errcode_t err;
2634	re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2635	re_sub_match_last_t *sub_last;
2636	Idx sub_last_idx, sl_str, bkref_str_off;
2637
2638	if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2639	continue; /* It isn't related. */
2640
2641	sl_str = sub_top->str_idx;
2642	bkref_str_off = bkref_str_idx;
2643	/* At first, check the last node of sub expressions we already
2644	evaluated. */
2645	for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2646	{
2647	regoff_t sl_str_diff;
2648	sub_last = sub_top->lasts[sub_last_idx];
2649	sl_str_diff = sub_last->str_idx - sl_str;
2650	/* The matched string by the sub expression match with the substring
2651	at the back reference? */
2652	if (sl_str_diff > 0)
2653	{
2654	if (__glibc_unlikely (bkref_str_off + sl_str_diff
2655	> mctx->input.valid_len))
2656	{
2657	/* Not enough chars for a successful match. */
2658	if (bkref_str_off + sl_str_diff > mctx->input.len)
2659	break;
2660
2661	err = clean_state_log_if_needed (mctx,
2662	bkref_str_off
2663	+ sl_str_diff);
2664	if (__glibc_unlikely (err != REG_NOERROR))
2665	return err;
2666	buf = (const char *) re_string_get_buffer (&mctx->input);
2667	}
2668	if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2669	/* We don't need to search this sub expression any more. */
2670	break;
2671	}
2672	bkref_str_off += sl_str_diff;
2673	sl_str += sl_str_diff;
2674	err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2675	bkref_str_idx);
2676
2677	/* Reload buf, since the preceding call might have reallocated
2678	the buffer. */
2679	buf = (const char *) re_string_get_buffer (&mctx->input);
2680
2681	if (err == REG_NOMATCH)
2682	continue;
2683	if (__glibc_unlikely (err != REG_NOERROR))
2684	return err;
2685	}
2686
2687	if (sub_last_idx < sub_top->nlasts)
2688	continue;
2689	if (sub_last_idx > 0)
2690	++sl_str;
2691	/* Then, search for the other last nodes of the sub expression. */
2692	for (; sl_str <= bkref_str_idx; ++sl_str)
2693	{
2694	Idx cls_node;
2695	regoff_t sl_str_off;
2696	const re_node_set *nodes;
2697	sl_str_off = sl_str - sub_top->str_idx;
2698	/* The matched string by the sub expression match with the substring
2699	at the back reference? */
2700	if (sl_str_off > 0)
2701	{
2702	if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2703	{
2704	/* If we are at the end of the input, we cannot match. */
2705	if (bkref_str_off >= mctx->input.len)
2706	break;
2707
2708	err = extend_buffers (mctx, bkref_str_off + 1);
2709	if (__glibc_unlikely (err != REG_NOERROR))
2710	return err;
2711
2712	buf = (const char *) re_string_get_buffer (&mctx->input);
2713	}
2714	if (buf [bkref_str_off++] != buf[sl_str - 1])
2715	break; /* We don't need to search this sub expression
2716	any more. */
2717	}
2718	if (mctx->state_log[sl_str] == NULL)
2719	continue;
2720	/* Does this state have a ')' of the sub expression? */
2721	nodes = &mctx->state_log[sl_str]->nodes;
2722	cls_node = find_subexp_node (dfa, nodes, subexp_num,
2723	OP_CLOSE_SUBEXP);
2724	if (cls_node == -1)
2725	continue; /* No. */
2726	if (sub_top->path == NULL)
2727	{
2728	sub_top->path = calloc (sizeof (state_array_t),
2729	sl_str - sub_top->str_idx + 1);
2730	if (sub_top->path == NULL)
2731	return REG_ESPACE;
2732	}
2733	/* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2734	in the current context? */
2735	err = check_arrival (mctx, sub_top->path, sub_top->node,
2736	sub_top->str_idx, cls_node, sl_str,
2737	OP_CLOSE_SUBEXP);
2738	if (err == REG_NOMATCH)
2739	continue;
2740	if (__glibc_unlikely (err != REG_NOERROR))
2741	return err;
2742	sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2743	if (__glibc_unlikely (sub_last == NULL))
2744	return REG_ESPACE;
2745	err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2746	bkref_str_idx);
2747	buf = (const char *) re_string_get_buffer (&mctx->input);
2748	if (err == REG_NOMATCH)
2749	continue;
2750	if (__glibc_unlikely (err != REG_NOERROR))
2751	return err;
2752	}
2753	}
2754	return REG_NOERROR;
2755	}
2756
2757	/* Helper functions for get_subexp(). */
2758
2759	/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2760	If it can arrive, register the sub expression expressed with SUB_TOP
2761	and SUB_LAST. */
2762
2763	static reg_errcode_t
2764	get_subexp_sub (re_match_context_t mctx, const re_sub_match_top_t sub_top,
2765	re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2766	{
2767	reg_errcode_t err;
2768	Idx to_idx;
2769	/* Can the subexpression arrive the back reference? */
2770	err = check_arrival (mctx, &sub_last->path, sub_last->node,
2771	sub_last->str_idx, bkref_node, bkref_str,
2772	OP_OPEN_SUBEXP);
2773	if (err != REG_NOERROR)
2774	return err;
2775	err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2776	sub_last->str_idx);
2777	if (__glibc_unlikely (err != REG_NOERROR))
2778	return err;
2779	to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2780	return clean_state_log_if_needed (mctx, to_idx);
2781	}
2782
2783	/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2784	Search '(' if FL_OPEN, or search ')' otherwise.
2785	TODO: This function isn't efficient...
2786	Because there might be more than one nodes whose types are
2787	OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2788	nodes.
2789	E.g. RE: (a){2} */
2790
2791	static Idx
2792	find_subexp_node (const re_dfa_t dfa, const re_node_set nodes,
2793	Idx subexp_idx, int type)
2794	{
2795	Idx cls_idx;
2796	for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2797	{
2798	Idx cls_node = nodes->elems[cls_idx];
2799	const re_token_t *node = dfa->nodes + cls_node;
2800	if (node->type == type
2801	&& node->opr.idx == subexp_idx)
2802	return cls_node;
2803	}
2804	return -1;
2805	}
2806
2807	/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2808	LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2809	heavily reused.
2810	Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2811	REG_ESPACE if memory is exhausted. */
2812
2813	static reg_errcode_t
2814	__attribute_warn_unused_result__
2815	check_arrival (re_match_context_t mctx, state_array_t path, Idx top_node,
2816	Idx top_str, Idx last_node, Idx last_str, int type)
2817	{
2818	const re_dfa_t *const dfa = mctx->dfa;
2819	reg_errcode_t err = REG_NOERROR;
2820	Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2821	re_dfastate_t *cur_state = NULL;
2822	re_node_set *cur_nodes, next_nodes;
2823	re_dfastate_t **backup_state_log;
2824	unsigned int context;
2825
2826	subexp_num = dfa->nodes[top_node].opr.idx;
2827	/* Extend the buffer if we need. */
2828	if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2829	{
2830	re_dfastate_t **new_array;
2831	Idx old_alloc = path->alloc;
2832	Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2833	Idx new_alloc;
2834	if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2835	return REG_ESPACE;
2836	new_alloc = old_alloc + incr_alloc;
2837	if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2838	return REG_ESPACE;
2839	new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2840	if (__glibc_unlikely (new_array == NULL))
2841	return REG_ESPACE;
2842	path->array = new_array;
2843	path->alloc = new_alloc;
2844	memset (new_array + old_alloc, '\0',
2845	sizeof (re_dfastate_t ) (path->alloc - old_alloc));
2846	}
2847
2848	str_idx = path->next_idx ? path->next_idx : top_str;
2849
2850	/* Temporary modify MCTX. */
2851	backup_state_log = mctx->state_log;
2852	backup_cur_idx = mctx->input.cur_idx;
2853	mctx->state_log = path->array;
2854	mctx->input.cur_idx = str_idx;
2855
2856	/* Setup initial node set. */
2857	context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2858	if (str_idx == top_str)
2859	{
2860	err = re_node_set_init_1 (&next_nodes, top_node);
2861	if (__glibc_unlikely (err != REG_NOERROR))
2862	return err;
2863	err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2864	if (__glibc_unlikely (err != REG_NOERROR))
2865	{
2866	re_node_set_free (&next_nodes);
2867	return err;
2868	}
2869	}
2870	else
2871	{
2872	cur_state = mctx->state_log[str_idx];
2873	if (cur_state && cur_state->has_backref)
2874	{
2875	err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2876	if (__glibc_unlikely (err != REG_NOERROR))
2877	return err;
2878	}
2879	else
2880	re_node_set_init_empty (&next_nodes);
2881	}
2882	if (str_idx == top_str \|\| (cur_state && cur_state->has_backref))
2883	{
2884	if (next_nodes.nelem)
2885	{
2886	err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2887	subexp_num, type);
2888	if (__glibc_unlikely (err != REG_NOERROR))
2889	{
2890	re_node_set_free (&next_nodes);
2891	return err;
2892	}
2893	}
2894	cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2895	if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2896	{
2897	re_node_set_free (&next_nodes);
2898	return err;
2899	}
2900	mctx->state_log[str_idx] = cur_state;
2901	}
2902
2903	for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2904	{
2905	re_node_set_empty (&next_nodes);
2906	if (mctx->state_log[str_idx + 1])
2907	{
2908	err = re_node_set_merge (&next_nodes,
2909	&mctx->state_log[str_idx + 1]->nodes);
2910	if (__glibc_unlikely (err != REG_NOERROR))
2911	{
2912	re_node_set_free (&next_nodes);
2913	return err;
2914	}
2915	}
2916	if (cur_state)
2917	{
2918	err = check_arrival_add_next_nodes (mctx, str_idx,
2919	&cur_state->non_eps_nodes,
2920	&next_nodes);
2921	if (__glibc_unlikely (err != REG_NOERROR))
2922	{
2923	re_node_set_free (&next_nodes);
2924	return err;
2925	}
2926	}
2927	++str_idx;
2928	if (next_nodes.nelem)
2929	{
2930	err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2931	if (__glibc_unlikely (err != REG_NOERROR))
2932	{
2933	re_node_set_free (&next_nodes);
2934	return err;
2935	}
2936	err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2937	subexp_num, type);
2938	if (__glibc_unlikely (err != REG_NOERROR))
2939	{
2940	re_node_set_free (&next_nodes);
2941	return err;
2942	}
2943	}
2944	context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2945	cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2946	if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2947	{
2948	re_node_set_free (&next_nodes);
2949	return err;
2950	}
2951	mctx->state_log[str_idx] = cur_state;
2952	null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2953	}
2954	re_node_set_free (&next_nodes);
2955	cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2956	: &mctx->state_log[last_str]->nodes);
2957	path->next_idx = str_idx;
2958
2959	/* Fix MCTX. */
2960	mctx->state_log = backup_state_log;
2961	mctx->input.cur_idx = backup_cur_idx;
2962
2963	/* Then check the current node set has the node LAST_NODE. */
2964	if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2965	return REG_NOERROR;
2966
2967	return REG_NOMATCH;
2968	}
2969
2970	/* Helper functions for check_arrival. */
2971
2972	/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2973	to NEXT_NODES.
2974	TODO: This function is similar to the functions transit_state*(),
2975	however this function has many additional works.
2976	Can't we unify them? */
2977
2978	static reg_errcode_t
2979	__attribute_warn_unused_result__
2980	check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2981	re_node_set cur_nodes, re_node_set next_nodes)
2982	{
2983	const re_dfa_t *const dfa = mctx->dfa;
2984	bool ok;
2985	Idx cur_idx;
2986	reg_errcode_t err = REG_NOERROR;
2987	re_node_set union_set;
2988	re_node_set_init_empty (&union_set);
2989	for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2990	{
2991	int naccepted = 0;
2992	Idx cur_node = cur_nodes->elems[cur_idx];
2993	DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
2994
2995	/* If the node may accept "multi byte". */
2996	if (dfa->nodes[cur_node].accept_mb)
2997	{
2998	naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2999	str_idx);
3000	if (naccepted > 1)
3001	{
3002	re_dfastate_t *dest_state;
3003	Idx next_node = dfa->nexts[cur_node];
3004	Idx next_idx = str_idx + naccepted;
3005	dest_state = mctx->state_log[next_idx];
3006	re_node_set_empty (&union_set);
3007	if (dest_state)
3008	{
3009	err = re_node_set_merge (&union_set, &dest_state->nodes);
3010	if (__glibc_unlikely (err != REG_NOERROR))
3011	{
3012	re_node_set_free (&union_set);
3013	return err;
3014	}
3015	}
3016	ok = re_node_set_insert (&union_set, next_node);
3017	if (__glibc_unlikely (! ok))
3018	{
3019	re_node_set_free (&union_set);
3020	return REG_ESPACE;
3021	}
3022	mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3023	&union_set);
3024	if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3025	&& err != REG_NOERROR))
3026	{
3027	re_node_set_free (&union_set);
3028	return err;
3029	}
3030	}
3031	}
3032
3033	if (naccepted
3034	\|\| check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3035	{
3036	ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3037	if (__glibc_unlikely (! ok))
3038	{
3039	re_node_set_free (&union_set);
3040	return REG_ESPACE;
3041	}
3042	}
3043	}
3044	re_node_set_free (&union_set);
3045	return REG_NOERROR;
3046	}
3047
3048	/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3049	CUR_NODES, however exclude the nodes which are:
3050	- inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3051	- out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3052	*/
3053
3054	static reg_errcode_t
3055	check_arrival_expand_ecl (const re_dfa_t dfa, re_node_set cur_nodes,
3056	Idx ex_subexp, int type)
3057	{
3058	reg_errcode_t err;
3059	Idx idx, outside_node;
3060	re_node_set new_nodes;
3061	DEBUG_ASSERT (cur_nodes->nelem);
3062	err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3063	if (__glibc_unlikely (err != REG_NOERROR))
3064	return err;
3065	/* Create a new node set NEW_NODES with the nodes which are epsilon
3066	closures of the node in CUR_NODES. */
3067
3068	for (idx = 0; idx < cur_nodes->nelem; ++idx)
3069	{
3070	Idx cur_node = cur_nodes->elems[idx];
3071	const re_node_set *eclosure = dfa->eclosures + cur_node;
3072	outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3073	if (outside_node == -1)
3074	{
3075	/* There are no problematic nodes, just merge them. */
3076	err = re_node_set_merge (&new_nodes, eclosure);
3077	if (__glibc_unlikely (err != REG_NOERROR))
3078	{
3079	re_node_set_free (&new_nodes);
3080	return err;
3081	}
3082	}
3083	else
3084	{
3085	/* There are problematic nodes, re-calculate incrementally. */
3086	err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3087	ex_subexp, type);
3088	if (__glibc_unlikely (err != REG_NOERROR))
3089	{
3090	re_node_set_free (&new_nodes);
3091	return err;
3092	}
3093	}
3094	}
3095	re_node_set_free (cur_nodes);
3096	*cur_nodes = new_nodes;
3097	return REG_NOERROR;
3098	}
3099
3100	/* Helper function for check_arrival_expand_ecl.
3101	Check incrementally the epsilon closure of TARGET, and if it isn't
3102	problematic append it to DST_NODES. */
3103
3104	static reg_errcode_t
3105	__attribute_warn_unused_result__
3106	check_arrival_expand_ecl_sub (const re_dfa_t dfa, re_node_set dst_nodes,
3107	Idx target, Idx ex_subexp, int type)
3108	{
3109	Idx cur_node;
3110	for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3111	{
3112	bool ok;
3113
3114	if (dfa->nodes[cur_node].type == type
3115	&& dfa->nodes[cur_node].opr.idx == ex_subexp)
3116	{
3117	if (type == OP_CLOSE_SUBEXP)
3118	{
3119	ok = re_node_set_insert (dst_nodes, cur_node);
3120	if (__glibc_unlikely (! ok))
3121	return REG_ESPACE;
3122	}
3123	break;
3124	}
3125	ok = re_node_set_insert (dst_nodes, cur_node);
3126	if (__glibc_unlikely (! ok))
3127	return REG_ESPACE;
3128	if (dfa->edests[cur_node].nelem == 0)
3129	break;
3130	if (dfa->edests[cur_node].nelem == 2)
3131	{
3132	reg_errcode_t err;
3133	err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3134	dfa->edests[cur_node].elems[1],
3135	ex_subexp, type);
3136	if (__glibc_unlikely (err != REG_NOERROR))
3137	return err;
3138	}
3139	cur_node = dfa->edests[cur_node].elems[0];
3140	}
3141	return REG_NOERROR;
3142	}
3143
3144
3145	/* For all the back references in the current state, calculate the
3146	destination of the back references by the appropriate entry
3147	in MCTX->BKREF_ENTS. */
3148
3149	static reg_errcode_t
3150	__attribute_warn_unused_result__
3151	expand_bkref_cache (re_match_context_t mctx, re_node_set cur_nodes,
3152	Idx cur_str, Idx subexp_num, int type)
3153	{
3154	const re_dfa_t *const dfa = mctx->dfa;
3155	reg_errcode_t err;
3156	Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3157	struct re_backref_cache_entry *ent;
3158
3159	if (cache_idx_start == -1)
3160	return REG_NOERROR;
3161
3162	restart:
3163	ent = mctx->bkref_ents + cache_idx_start;
3164	do
3165	{
3166	Idx to_idx, next_node;
3167
3168	/* Is this entry ENT is appropriate? */
3169	if (!re_node_set_contains (cur_nodes, ent->node))
3170	continue; /* No. */
3171
3172	to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3173	/* Calculate the destination of the back reference, and append it
3174	to MCTX->STATE_LOG. */
3175	if (to_idx == cur_str)
3176	{
3177	/* The backreference did epsilon transit, we must re-check all the
3178	node in the current state. */
3179	re_node_set new_dests;
3180	reg_errcode_t err2, err3;
3181	next_node = dfa->edests[ent->node].elems[0];
3182	if (re_node_set_contains (cur_nodes, next_node))
3183	continue;
3184	err = re_node_set_init_1 (&new_dests, next_node);
3185	err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3186	err3 = re_node_set_merge (cur_nodes, &new_dests);
3187	re_node_set_free (&new_dests);
3188	if (__glibc_unlikely (err != REG_NOERROR \|\| err2 != REG_NOERROR
3189	\|\| err3 != REG_NOERROR))
3190	{
3191	err = (err != REG_NOERROR ? err
3192	: (err2 != REG_NOERROR ? err2 : err3));
3193	return err;
3194	}
3195	/* TODO: It is still inefficient... */
3196	goto restart;
3197	}
3198	else
3199	{
3200	re_node_set union_set;
3201	next_node = dfa->nexts[ent->node];
3202	if (mctx->state_log[to_idx])
3203	{
3204	bool ok;
3205	if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3206	next_node))
3207	continue;
3208	err = re_node_set_init_copy (&union_set,
3209	&mctx->state_log[to_idx]->nodes);
3210	ok = re_node_set_insert (&union_set, next_node);
3211	if (__glibc_unlikely (err != REG_NOERROR \|\| ! ok))
3212	{
3213	re_node_set_free (&union_set);
3214	err = err != REG_NOERROR ? err : REG_ESPACE;
3215	return err;
3216	}
3217	}
3218	else
3219	{
3220	err = re_node_set_init_1 (&union_set, next_node);
3221	if (__glibc_unlikely (err != REG_NOERROR))
3222	return err;
3223	}
3224	mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3225	re_node_set_free (&union_set);
3226	if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3227	&& err != REG_NOERROR))
3228	return err;
3229	}
3230	}
3231	while (ent++->more);
3232	return REG_NOERROR;
3233	}
3234
3235	/* Build transition table for the state.
3236	Return true if successful. */
3237
3238	static bool __attribute_noinline__
3239	build_trtable (const re_dfa_t dfa, re_dfastate_t state)
3240	{
3241	reg_errcode_t err;
3242	Idx i, j;
3243	int ch;
3244	bool need_word_trtable = false;
3245	bitset_word_t elem, mask;
3246	Idx ndests; /* Number of the destination states from 'state'. */
3247	re_dfastate_t **trtable;
3248	re_dfastate_t *dest_states[SBC_MAX];
3249	re_dfastate_t *dest_states_word[SBC_MAX];
3250	re_dfastate_t *dest_states_nl[SBC_MAX];
3251	re_node_set follows;
3252	bitset_t acceptable;
3253
3254	/* We build DFA states which corresponds to the destination nodes
3255	from 'state'. 'dests_node[i]' represents the nodes which i-th
3256	destination state contains, and 'dests_ch[i]' represents the
3257	characters which i-th destination state accepts. */
3258	re_node_set dests_node[SBC_MAX];
3259	bitset_t dests_ch[SBC_MAX];
3260
3261	/* Initialize transition table. */
3262	state->word_trtable = state->trtable = NULL;
3263
3264	/* At first, group all nodes belonging to 'state' into several
3265	destinations. */
3266	ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3267	if (__glibc_unlikely (ndests <= 0))
3268	{
3269	/* Return false in case of an error, true otherwise. */
3270	if (ndests == 0)
3271	{
3272	state->trtable = (re_dfastate_t **)
3273	calloc (sizeof (re_dfastate_t *), SBC_MAX);
3274	if (__glibc_unlikely (state->trtable == NULL))
3275	return false;
3276	return true;
3277	}
3278	return false;
3279	}
3280
3281	err = re_node_set_alloc (&follows, ndests + 1);
3282	if (__glibc_unlikely (err != REG_NOERROR))
3283	{
3284	out_free:
3285	re_node_set_free (&follows);
3286	for (i = 0; i < ndests; ++i)
3287	re_node_set_free (dests_node + i);
3288	return false;
3289	}
3290
3291	bitset_empty (acceptable);
3292
3293	/* Then build the states for all destinations. */
3294	for (i = 0; i < ndests; ++i)
3295	{
3296	Idx next_node;
3297	re_node_set_empty (&follows);
3298	/* Merge the follows of this destination states. */
3299	for (j = 0; j < dests_node[i].nelem; ++j)
3300	{
3301	next_node = dfa->nexts[dests_node[i].elems[j]];
3302	if (next_node != -1)
3303	{
3304	err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3305	if (__glibc_unlikely (err != REG_NOERROR))
3306	goto out_free;
3307	}
3308	}
3309	dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3310	if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3311	goto out_free;
3312	/* If the new state has context constraint,
3313	build appropriate states for these contexts. */
3314	if (dest_states[i]->has_constraint)
3315	{
3316	dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3317	CONTEXT_WORD);
3318	if (__glibc_unlikely (dest_states_word[i] == NULL
3319	&& err != REG_NOERROR))
3320	goto out_free;
3321
3322	if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3323	need_word_trtable = true;
3324
3325	dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3326	CONTEXT_NEWLINE);
3327	if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3328	goto out_free;
3329	}
3330	else
3331	{
3332	dest_states_word[i] = dest_states[i];
3333	dest_states_nl[i] = dest_states[i];
3334	}
3335	bitset_merge (acceptable, dests_ch[i]);
3336	}
3337
3338	if (!__glibc_unlikely (need_word_trtable))
3339	{
3340	/* We don't care about whether the following character is a word
3341	character, or we are in a single-byte character set so we can
3342	discern by looking at the character code: allocate a
3343	256-entry transition table. */
3344	trtable = state->trtable =
3345	(re_dfastate_t *) calloc (sizeof (re_dfastate_t ), SBC_MAX);
3346	if (__glibc_unlikely (trtable == NULL))
3347	goto out_free;
3348
3349	/* For all characters ch...: */
3350	for (i = 0; i < BITSET_WORDS; ++i)
3351	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3352	elem;
3353	mask <<= 1, elem >>= 1, ++ch)
3354	if (__glibc_unlikely (elem & 1))
3355	{
3356	/* There must be exactly one destination which accepts
3357	character ch. See group_nodes_into_DFAstates. */
3358	for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3359	;
3360
3361	/* j-th destination accepts the word character ch. */
3362	if (dfa->word_char[i] & mask)
3363	trtable[ch] = dest_states_word[j];
3364	else
3365	trtable[ch] = dest_states[j];
3366	}
3367	}
3368	else
3369	{
3370	/* We care about whether the following character is a word
3371	character, and we are in a multi-byte character set: discern
3372	by looking at the character code: build two 256-entry
3373	transition tables, one starting at trtable[0] and one
3374	starting at trtable[SBC_MAX]. */
3375	trtable = state->word_trtable =
3376	(re_dfastate_t *) calloc (sizeof (re_dfastate_t ), 2 * SBC_MAX);
3377	if (__glibc_unlikely (trtable == NULL))
3378	goto out_free;
3379
3380	/* For all characters ch...: */
3381	for (i = 0; i < BITSET_WORDS; ++i)
3382	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3383	elem;
3384	mask <<= 1, elem >>= 1, ++ch)
3385	if (__glibc_unlikely (elem & 1))
3386	{
3387	/* There must be exactly one destination which accepts
3388	character ch. See group_nodes_into_DFAstates. */
3389	for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3390	;
3391
3392	/* j-th destination accepts the word character ch. */
3393	trtable[ch] = dest_states[j];
3394	trtable[ch + SBC_MAX] = dest_states_word[j];
3395	}
3396	}
3397
3398	/* new line */
3399	if (bitset_contain (acceptable, NEWLINE_CHAR))
3400	{
3401	/* The current state accepts newline character. */
3402	for (j = 0; j < ndests; ++j)
3403	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3404	{
3405	/* k-th destination accepts newline character. */
3406	trtable[NEWLINE_CHAR] = dest_states_nl[j];
3407	if (need_word_trtable)
3408	trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3409	/* There must be only one destination which accepts
3410	newline. See group_nodes_into_DFAstates. */
3411	break;
3412	}
3413	}
3414
3415	re_node_set_free (&follows);
3416	for (i = 0; i < ndests; ++i)
3417	re_node_set_free (dests_node + i);
3418	return true;
3419	}
3420
3421	/* Group all nodes belonging to STATE into several destinations.
3422	Then for all destinations, set the nodes belonging to the destination
3423	to DESTS_NODE[i] and set the characters accepted by the destination
3424	to DEST_CH[i]. Return the number of destinations if successful,
3425	-1 on internal error. */
3426
3427	static Idx
3428	group_nodes_into_DFAstates (const re_dfa_t dfa, const re_dfastate_t state,
3429	re_node_set dests_node, bitset_t dests_ch)
3430	{
3431	reg_errcode_t err;
3432	bool ok;
3433	Idx i, j, k;
3434	Idx ndests; /* Number of the destinations from 'state'. */
3435	bitset_t accepts; /* Characters a node can accept. */
3436	const re_node_set *cur_nodes = &state->nodes;
3437	bitset_empty (accepts);
3438	ndests = 0;
3439
3440	/* For all the nodes belonging to 'state', */
3441	for (i = 0; i < cur_nodes->nelem; ++i)
3442	{
3443	re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3444	re_token_type_t type = node->type;
3445	unsigned int constraint = node->constraint;
3446
3447	/* Enumerate all single byte character this node can accept. */
3448	if (type == CHARACTER)
3449	bitset_set (accepts, node->opr.c);
3450	else if (type == SIMPLE_BRACKET)
3451	{
3452	bitset_merge (accepts, node->opr.sbcset);
3453	}
3454	else if (type == OP_PERIOD)
3455	{
3456	if (dfa->mb_cur_max > 1)
3457	bitset_merge (accepts, dfa->sb_char);
3458	else
3459	bitset_set_all (accepts);
3460	if (!(dfa->syntax & RE_DOT_NEWLINE))
3461	bitset_clear (accepts, '\n');
3462	if (dfa->syntax & RE_DOT_NOT_NULL)
3463	bitset_clear (accepts, '\0');
3464	}
3465	else if (type == OP_UTF8_PERIOD)
3466	{
3467	if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3468	memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3469	else
3470	bitset_merge (accepts, utf8_sb_map);
3471	if (!(dfa->syntax & RE_DOT_NEWLINE))
3472	bitset_clear (accepts, '\n');
3473	if (dfa->syntax & RE_DOT_NOT_NULL)
3474	bitset_clear (accepts, '\0');
3475	}
3476	else
3477	continue;
3478
3479	/* Check the 'accepts' and sift the characters which are not
3480	match it the context. */
3481	if (constraint)
3482	{
3483	if (constraint & NEXT_NEWLINE_CONSTRAINT)
3484	{
3485	bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3486	bitset_empty (accepts);
3487	if (accepts_newline)
3488	bitset_set (accepts, NEWLINE_CHAR);
3489	else
3490	continue;
3491	}
3492	if (constraint & NEXT_ENDBUF_CONSTRAINT)
3493	{
3494	bitset_empty (accepts);
3495	continue;
3496	}
3497
3498	if (constraint & NEXT_WORD_CONSTRAINT)
3499	{
3500	bitset_word_t any_set = 0;
3501	if (type == CHARACTER && !node->word_char)
3502	{
3503	bitset_empty (accepts);
3504	continue;
3505	}
3506	if (dfa->mb_cur_max > 1)
3507	for (j = 0; j < BITSET_WORDS; ++j)
3508	any_set \|= (accepts[j] &= (dfa->word_char[j] \| ~dfa->sb_char[j]));
3509	else
3510	for (j = 0; j < BITSET_WORDS; ++j)
3511	any_set \|= (accepts[j] &= dfa->word_char[j]);
3512	if (!any_set)
3513	continue;
3514	}
3515	if (constraint & NEXT_NOTWORD_CONSTRAINT)
3516	{
3517	bitset_word_t any_set = 0;
3518	if (type == CHARACTER && node->word_char)
3519	{
3520	bitset_empty (accepts);
3521	continue;
3522	}
3523	if (dfa->mb_cur_max > 1)
3524	for (j = 0; j < BITSET_WORDS; ++j)
3525	any_set \|= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3526	else
3527	for (j = 0; j < BITSET_WORDS; ++j)
3528	any_set \|= (accepts[j] &= ~dfa->word_char[j]);
3529	if (!any_set)
3530	continue;
3531	}
3532	}
3533
3534	/* Then divide 'accepts' into DFA states, or create a new
3535	state. Above, we make sure that accepts is not empty. */
3536	for (j = 0; j < ndests; ++j)
3537	{
3538	bitset_t intersec; /* Intersection sets, see below. */
3539	bitset_t remains;
3540	/* Flags, see below. */
3541	bitset_word_t has_intersec, not_subset, not_consumed;
3542
3543	/* Optimization, skip if this state doesn't accept the character. */
3544	if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3545	continue;
3546
3547	/* Enumerate the intersection set of this state and 'accepts'. */
3548	has_intersec = 0;
3549	for (k = 0; k < BITSET_WORDS; ++k)
3550	has_intersec \|= intersec[k] = accepts[k] & dests_ch[j][k];
3551	/* And skip if the intersection set is empty. */
3552	if (!has_intersec)
3553	continue;
3554
3555	/* Then check if this state is a subset of 'accepts'. */
3556	not_subset = not_consumed = 0;
3557	for (k = 0; k < BITSET_WORDS; ++k)
3558	{
3559	not_subset \|= remains[k] = ~accepts[k] & dests_ch[j][k];
3560	not_consumed \|= accepts[k] = accepts[k] & ~dests_ch[j][k];
3561	}
3562
3563	/* If this state isn't a subset of 'accepts', create a
3564	new group state, which has the 'remains'. */
3565	if (not_subset)
3566	{
3567	bitset_copy (dests_ch[ndests], remains);
3568	bitset_copy (dests_ch[j], intersec);
3569	err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3570	if (__glibc_unlikely (err != REG_NOERROR))
3571	goto error_return;
3572	++ndests;
3573	}
3574
3575	/* Put the position in the current group. */
3576	ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3577	if (__glibc_unlikely (! ok))
3578	goto error_return;
3579
3580	/* If all characters are consumed, go to next node. */
3581	if (!not_consumed)
3582	break;
3583	}
3584	/* Some characters remain, create a new group. */
3585	if (j == ndests)
3586	{
3587	bitset_copy (dests_ch[ndests], accepts);
3588	err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3589	if (__glibc_unlikely (err != REG_NOERROR))
3590	goto error_return;
3591	++ndests;
3592	bitset_empty (accepts);
3593	}
3594	}
3595	assume (ndests <= SBC_MAX);
3596	return ndests;
3597	error_return:
3598	for (j = 0; j < ndests; ++j)
3599	re_node_set_free (dests_node + j);
3600	return -1;
3601	}
3602
3603	/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3604	Return the number of the bytes the node accepts.
3605	STR_IDX is the current index of the input string.
3606
3607	This function handles the nodes which can accept one character, or
3608	one collating element like '.', '[a-z]', opposite to the other nodes
3609	can only accept one byte. */
3610
3611	#ifdef _LIBC
3612	# include <locale/weight.h>
3613	#endif
3614
3615	static int
3616	check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3617	const re_string_t *input, Idx str_idx)
3618	{
3619	const re_token_t *node = dfa->nodes + node_idx;
3620	int char_len, elem_len;
3621	Idx i;
3622
3623	if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3624	{
3625	unsigned char c = re_string_byte_at (input, str_idx), d;
3626	if (__glibc_likely (c < 0xc2))
3627	return 0;
3628
3629	if (str_idx + 2 > input->len)
3630	return 0;
3631
3632	d = re_string_byte_at (input, str_idx + 1);
3633	if (c < 0xe0)
3634	return (d < 0x80 \|\| d > 0xbf) ? 0 : 2;
3635	else if (c < 0xf0)
3636	{
3637	char_len = 3;
3638	if (c == 0xe0 && d < 0xa0)
3639	return 0;
3640	}
3641	else if (c < 0xf8)
3642	{
3643	char_len = 4;
3644	if (c == 0xf0 && d < 0x90)
3645	return 0;
3646	}
3647	else if (c < 0xfc)
3648	{
3649	char_len = 5;
3650	if (c == 0xf8 && d < 0x88)
3651	return 0;
3652	}
3653	else if (c < 0xfe)
3654	{
3655	char_len = 6;
3656	if (c == 0xfc && d < 0x84)
3657	return 0;
3658	}
3659	else
3660	return 0;
3661
3662	if (str_idx + char_len > input->len)
3663	return 0;
3664
3665	for (i = 1; i < char_len; ++i)
3666	{
3667	d = re_string_byte_at (input, str_idx + i);
3668	if (d < 0x80 \|\| d > 0xbf)
3669	return 0;
3670	}
3671	return char_len;
3672	}
3673
3674	char_len = re_string_char_size_at (input, str_idx);
3675	if (node->type == OP_PERIOD)
3676	{
3677	if (char_len <= 1)
3678	return 0;
3679	/* FIXME: I don't think this if is needed, as both '\n'
3680	and '\0' are char_len == 1. */
3681	/* '.' accepts any one character except the following two cases. */
3682	if ((!(dfa->syntax & RE_DOT_NEWLINE)
3683	&& re_string_byte_at (input, str_idx) == '\n')
3684	\|\| ((dfa->syntax & RE_DOT_NOT_NULL)
3685	&& re_string_byte_at (input, str_idx) == '\0'))
3686	return 0;
3687	return char_len;
3688	}
3689
3690	elem_len = re_string_elem_size_at (input, str_idx);
3691	if ((elem_len <= 1 && char_len <= 1) \|\| char_len == 0)
3692	return 0;
3693
3694	if (node->type == COMPLEX_BRACKET)
3695	{
3696	const re_charset_t *cset = node->opr.mbcset;
3697	#ifdef _LIBC
3698	const unsigned char *pin
3699	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3700	Idx j;
3701	uint32_t nrules;
3702	#endif
3703	int match_len = 0;
3704	wchar_t wc = ((cset->nranges \|\| cset->nchar_classes \|\| cset->nmbchars)
3705	? re_string_wchar_at (input, str_idx) : 0);
3706
3707	/* match with multibyte character? */
3708	for (i = 0; i < cset->nmbchars; ++i)
3709	if (wc == cset->mbchars[i])
3710	{
3711	match_len = char_len;
3712	goto check_node_accept_bytes_match;
3713	}
3714	/* match with character_class? */
3715	for (i = 0; i < cset->nchar_classes; ++i)
3716	{
3717	wctype_t wt = cset->char_classes[i];
3718	if (__iswctype (wc, wt))
3719	{
3720	match_len = char_len;
3721	goto check_node_accept_bytes_match;
3722	}
3723	}
3724
3725	#ifdef _LIBC
3726	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3727	if (nrules != 0)
3728	{
3729	unsigned int in_collseq = 0;
3730	const int32_t table, indirect;
3731	const unsigned char weights, extra;
3732	const char *collseqwc;
3733
3734	/* match with collating_symbol? */
3735	if (cset->ncoll_syms)
3736	extra = (const unsigned char *)
3737	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3738	for (i = 0; i < cset->ncoll_syms; ++i)
3739	{
3740	const unsigned char *coll_sym = extra + cset->coll_syms[i];
3741	/* Compare the length of input collating element and
3742	the length of current collating element. */
3743	if (*coll_sym != elem_len)
3744	continue;
3745	/* Compare each bytes. */
3746	for (j = 0; j < *coll_sym; j++)
3747	if (pin[j] != coll_sym[1 + j])
3748	break;
3749	if (j == *coll_sym)
3750	{
3751	/* Match if every bytes is equal. */
3752	match_len = j;
3753	goto check_node_accept_bytes_match;
3754	}
3755	}
3756
3757	if (cset->nranges)
3758	{
3759	if (elem_len <= char_len)
3760	{
3761	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3762	in_collseq = __collseq_table_lookup (collseqwc, wc);
3763	}
3764	else
3765	in_collseq = find_collation_sequence_value (pin, elem_len);
3766	}
3767	/* match with range expression? */
3768	/* FIXME: Implement rational ranges here, too. */
3769	for (i = 0; i < cset->nranges; ++i)
3770	if (cset->range_starts[i] <= in_collseq
3771	&& in_collseq <= cset->range_ends[i])
3772	{
3773	match_len = elem_len;
3774	goto check_node_accept_bytes_match;
3775	}
3776
3777	/* match with equivalence_class? */
3778	if (cset->nequiv_classes)
3779	{
3780	const unsigned char *cp = pin;
3781	table = (const int32_t *)
3782	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3783	weights = (const unsigned char *)
3784	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3785	extra = (const unsigned char *)
3786	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3787	indirect = (const int32_t *)
3788	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3789	int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3790	int32_t rule = idx >> 24;
3791	idx &= 0xffffff;
3792	if (idx > 0)
3793	{
3794	size_t weight_len = weights[idx];
3795	for (i = 0; i < cset->nequiv_classes; ++i)
3796	{
3797	int32_t equiv_class_idx = cset->equiv_classes[i];
3798	int32_t equiv_class_rule = equiv_class_idx >> 24;
3799	equiv_class_idx &= 0xffffff;
3800	if (weights[equiv_class_idx] == weight_len
3801	&& equiv_class_rule == rule
3802	&& memcmp (weights + idx + 1,
3803	weights + equiv_class_idx + 1,
3804	weight_len) == 0)
3805	{
3806	match_len = elem_len;
3807	goto check_node_accept_bytes_match;
3808	}
3809	}
3810	}
3811	}
3812	}
3813	else
3814	#endif /* _LIBC */
3815	{
3816	/* match with range expression? */
3817	for (i = 0; i < cset->nranges; ++i)
3818	{
3819	if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3820	{
3821	match_len = char_len;
3822	goto check_node_accept_bytes_match;
3823	}
3824	}
3825	}
3826	check_node_accept_bytes_match:
3827	if (!cset->non_match)
3828	return match_len;
3829	else
3830	{
3831	if (match_len > 0)
3832	return 0;
3833	else
3834	return (elem_len > char_len) ? elem_len : char_len;
3835	}
3836	}
3837	return 0;
3838	}
3839
3840	#ifdef _LIBC
3841	static unsigned int
3842	find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3843	{
3844	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3845	if (nrules == 0)
3846	{
3847	if (mbs_len == 1)
3848	{
3849	/* No valid character. Match it as a single byte character. */
3850	const unsigned char collseq = (const unsigned char )
3851	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3852	return collseq[mbs[0]];
3853	}
3854	return UINT_MAX;
3855	}
3856	else
3857	{
3858	int32_t idx;
3859	const unsigned char extra = (const unsigned char )
3860	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3861	int32_t extrasize = (const unsigned char *)
3862	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3863
3864	for (idx = 0; idx < extrasize;)
3865	{
3866	int mbs_cnt;
3867	bool found = false;
3868	int32_t elem_mbs_len;
3869	/* Skip the name of collating element name. */
3870	idx = idx + extra[idx] + 1;
3871	elem_mbs_len = extra[idx++];
3872	if (mbs_len == elem_mbs_len)
3873	{
3874	for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3875	if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3876	break;
3877	if (mbs_cnt == elem_mbs_len)
3878	/* Found the entry. */
3879	found = true;
3880	}
3881	/* Skip the byte sequence of the collating element. */
3882	idx += elem_mbs_len;
3883	/* Adjust for the alignment. */
3884	idx = (idx + 3) & ~3;
3885	/* Skip the collation sequence value. */
3886	idx += sizeof (uint32_t);
3887	/* Skip the wide char sequence of the collating element. */
3888	idx = idx + sizeof (uint32_t) * ((int32_t ) (extra + idx) + 1);
3889	/* If we found the entry, return the sequence value. */
3890	if (found)
3891	return (uint32_t ) (extra + idx);
3892	/* Skip the collation sequence value. */
3893	idx += sizeof (uint32_t);
3894	}
3895	return UINT_MAX;
3896	}
3897	}
3898	#endif /* _LIBC */
3899
3900	/* Check whether the node accepts the byte which is IDX-th
3901	byte of the INPUT. */
3902
3903	static bool
3904	check_node_accept (const re_match_context_t mctx, const re_token_t node,
3905	Idx idx)
3906	{
3907	unsigned char ch;
3908	ch = re_string_byte_at (&mctx->input, idx);
3909	switch (node->type)
3910	{
3911	case CHARACTER:
3912	if (node->opr.c != ch)
3913	return false;
3914	break;
3915
3916	case SIMPLE_BRACKET:
3917	if (!bitset_contain (node->opr.sbcset, ch))
3918	return false;
3919	break;
3920
3921	case OP_UTF8_PERIOD:
3922	if (ch >= ASCII_CHARS)
3923	return false;
3924	FALLTHROUGH;
3925	case OP_PERIOD:
3926	if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3927	\|\| (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3928	return false;
3929	break;
3930
3931	default:
3932	return false;
3933	}
3934
3935	if (node->constraint)
3936	{
3937	/* The node has constraints. Check whether the current context
3938	satisfies the constraints. */
3939	unsigned int context = re_string_context_at (&mctx->input, idx,
3940	mctx->eflags);
3941	if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3942	return false;
3943	}
3944
3945	return true;
3946	}
3947
3948	/* Extend the buffers, if the buffers have run out. */
3949
3950	static reg_errcode_t
3951	__attribute_warn_unused_result__
3952	extend_buffers (re_match_context_t *mctx, int min_len)
3953	{
3954	reg_errcode_t ret;
3955	re_string_t *pstr = &mctx->input;
3956
3957	/* Avoid overflow. */
3958	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3959	<= pstr->bufs_len))
3960	return REG_ESPACE;
3961
3962	/* Double the lengths of the buffers, but allocate at least MIN_LEN. */
3963	ret = re_string_realloc_buffers (pstr,
3964	MAX (min_len,
3965	MIN (pstr->len, pstr->bufs_len * 2)));
3966	if (__glibc_unlikely (ret != REG_NOERROR))
3967	return ret;
3968
3969	if (mctx->state_log != NULL)
3970	{
3971	/* And double the length of state_log. */
3972	/* XXX We have no indication of the size of this buffer. If this
3973	allocation fail we have no indication that the state_log array
3974	does not have the right size. */
3975	re_dfastate_t *new_array = re_realloc (mctx->state_log, re_dfastate_t ,
3976	pstr->bufs_len + 1);
3977	if (__glibc_unlikely (new_array == NULL))
3978	return REG_ESPACE;
3979	mctx->state_log = new_array;
3980	}
3981
3982	/* Then reconstruct the buffers. */
3983	if (pstr->icase)
3984	{
3985	if (pstr->mb_cur_max > 1)
3986	{
3987	ret = build_wcs_upper_buffer (pstr);
3988	if (__glibc_unlikely (ret != REG_NOERROR))
3989	return ret;
3990	}
3991	else
3992	build_upper_buffer (pstr);
3993	}
3994	else
3995	{
3996	if (pstr->mb_cur_max > 1)
3997	build_wcs_buffer (pstr);
3998	else
3999	{
4000	if (pstr->trans != NULL)
4001	re_string_translate_buffer (pstr);
4002	}
4003	}
4004	return REG_NOERROR;
4005	}
4006
4007
4008
4009	/* Functions for matching context. */
4010
4011	/* Initialize MCTX. */
4012
4013	static reg_errcode_t
4014	__attribute_warn_unused_result__
4015	match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4016	{
4017	mctx->eflags = eflags;
4018	mctx->match_last = -1;
4019	if (n > 0)
4020	{
4021	/* Avoid overflow. */
4022	size_t max_object_size =
4023	MAX (sizeof (struct re_backref_cache_entry),
4024	sizeof (re_sub_match_top_t *));
4025	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4026	return REG_ESPACE;
4027
4028	mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4029	mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4030	if (__glibc_unlikely (mctx->bkref_ents == NULL \|\| mctx->sub_tops == NULL))
4031	return REG_ESPACE;
4032	}
4033	/* Already zero-ed by the caller.
4034	else
4035	mctx->bkref_ents = NULL;
4036	mctx->nbkref_ents = 0;
4037	mctx->nsub_tops = 0; */
4038	mctx->abkref_ents = n;
4039	mctx->max_mb_elem_len = 1;
4040	mctx->asub_tops = n;
4041	return REG_NOERROR;
4042	}
4043
4044	/* Clean the entries which depend on the current input in MCTX.
4045	This function must be invoked when the matcher changes the start index
4046	of the input, or changes the input string. */
4047
4048	static void
4049	match_ctx_clean (re_match_context_t *mctx)
4050	{
4051	Idx st_idx;
4052	for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4053	{
4054	Idx sl_idx;
4055	re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4056	for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4057	{
4058	re_sub_match_last_t *last = top->lasts[sl_idx];
4059	re_free (last->path.array);
4060	re_free (last);
4061	}
4062	re_free (top->lasts);
4063	if (top->path)
4064	{
4065	re_free (top->path->array);
4066	re_free (top->path);
4067	}
4068	re_free (top);
4069	}
4070
4071	mctx->nsub_tops = 0;
4072	mctx->nbkref_ents = 0;
4073	}
4074
4075	/* Free all the memory associated with MCTX. */
4076
4077	static void
4078	match_ctx_free (re_match_context_t *mctx)
4079	{
4080	/* First, free all the memory associated with MCTX->SUB_TOPS. */
4081	match_ctx_clean (mctx);
4082	re_free (mctx->sub_tops);
4083	re_free (mctx->bkref_ents);
4084	}
4085
4086	/* Add a new backreference entry to MCTX.
4087	Note that we assume that caller never call this function with duplicate
4088	entry, and call with STR_IDX which isn't smaller than any existing entry.
4089	*/
4090
4091	static reg_errcode_t
4092	__attribute_warn_unused_result__
4093	match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4094	Idx to)
4095	{
4096	if (mctx->nbkref_ents >= mctx->abkref_ents)
4097	{
4098	struct re_backref_cache_entry* new_entry;
4099	new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4100	mctx->abkref_ents * 2);
4101	if (__glibc_unlikely (new_entry == NULL))
4102	{
4103	re_free (mctx->bkref_ents);
4104	return REG_ESPACE;
4105	}
4106	mctx->bkref_ents = new_entry;
4107	memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4108	sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4109	mctx->abkref_ents *= 2;
4110	}
4111	if (mctx->nbkref_ents > 0
4112	&& mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4113	mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4114
4115	mctx->bkref_ents[mctx->nbkref_ents].node = node;
4116	mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4117	mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4118	mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4119
4120	/* This is a cache that saves negative results of check_dst_limits_calc_pos.
4121	If bit N is clear, means that this entry won't epsilon-transition to
4122	an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4123	it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4124	such node.
4125
4126	A backreference does not epsilon-transition unless it is empty, so set
4127	to all zeros if FROM != TO. */
4128	mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4129	= (from == to ? -1 : 0);
4130
4131	mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4132	if (mctx->max_mb_elem_len < to - from)
4133	mctx->max_mb_elem_len = to - from;
4134	return REG_NOERROR;
4135	}
4136
4137	/* Return the first entry with the same str_idx, or -1 if none is
4138	found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4139
4140	static Idx
4141	search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4142	{
4143	Idx left, right, mid, last;
4144	last = right = mctx->nbkref_ents;
4145	for (left = 0; left < right;)
4146	{
4147	mid = (left + right) / 2;
4148	if (mctx->bkref_ents[mid].str_idx < str_idx)
4149	left = mid + 1;
4150	else
4151	right = mid;
4152	}
4153	if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4154	return left;
4155	else
4156	return -1;
4157	}
4158
4159	/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4160	at STR_IDX. */
4161
4162	static reg_errcode_t
4163	__attribute_warn_unused_result__
4164	match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4165	{
4166	DEBUG_ASSERT (mctx->sub_tops != NULL);
4167	DEBUG_ASSERT (mctx->asub_tops > 0);
4168	if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4169	{
4170	Idx new_asub_tops = mctx->asub_tops * 2;
4171	re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4172	re_sub_match_top_t *,
4173	new_asub_tops);
4174	if (__glibc_unlikely (new_array == NULL))
4175	return REG_ESPACE;
4176	mctx->sub_tops = new_array;
4177	mctx->asub_tops = new_asub_tops;
4178	}
4179	mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4180	if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4181	return REG_ESPACE;
4182	mctx->sub_tops[mctx->nsub_tops]->node = node;
4183	mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4184	return REG_NOERROR;
4185	}
4186
4187	/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4188	at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4189	Return the new entry if successful, NULL if memory is exhausted. */
4190
4191	static re_sub_match_last_t *
4192	match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4193	{
4194	re_sub_match_last_t *new_entry;
4195	if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4196	{
4197	Idx new_alasts = 2 * subtop->alasts + 1;
4198	re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4199	re_sub_match_last_t *,
4200	new_alasts);
4201	if (__glibc_unlikely (new_array == NULL))
4202	return NULL;
4203	subtop->lasts = new_array;
4204	subtop->alasts = new_alasts;
4205	}
4206	new_entry = calloc (1, sizeof (re_sub_match_last_t));
4207	if (__glibc_likely (new_entry != NULL))
4208	{
4209	subtop->lasts[subtop->nlasts] = new_entry;
4210	new_entry->node = node;
4211	new_entry->str_idx = str_idx;
4212	++subtop->nlasts;
4213	}
4214	return new_entry;
4215	}
4216
4217	static void
4218	sift_ctx_init (re_sift_context_t sctx, re_dfastate_t *sifted_sts,
4219	re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4220	{
4221	sctx->sifted_states = sifted_sts;
4222	sctx->limited_states = limited_sts;
4223	sctx->last_node = last_node;
4224	sctx->last_str_idx = last_str_idx;
4225	re_node_set_init_empty (&sctx->limits);
4226	}

Note: See TracBrowser for help on using the repository browser.

source: kBuild/trunk/src/sed/lib/regexec.c@ 3613

Download in other formats: