regex.c
DEFINITIONS
This source file includes following functions.
- PARAMS
- PARAMS
- _
- _
- init_syntax_once
- re_set_casetable
- ISASCII
- ISASCII
- ISBLANK
- ISBLANK
- ISGRAPH
- ISGRAPH
- SIGN_EXTEND_CHAR
- SIGN_EXTEND_CHAR
- re_set_syntax
- utf8_firstbyte
- print_mbc
- set_list_bits
- is_in_list
- print_partial_compiled_pattern
- print_compiled_pattern
- calculate_must_string
- read_backslash
- read_special
- re_compile_pattern
- re_free_pattern
- store_jump
- insert_jump
- store_jump_n
- insert_jump_n
- insert_op
- insert_op_2
- slow_match
- slow_search
- bm_init_skip
- bm_search
- re_compile_fastmap
- re_adjust_startpos
- re_search
- init_regs
- re_match
- memcmp_translate
- re_copy_registers
- re_free_registers
- re_mbcinit
- asc_startpos
- euc_startpos
- sjis_startpos
- utf8_startpos
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 #include "config.h"
25
26 #ifdef HAVE_STRING_H
27 # include <string.h>
28 #else
29 # include <strings.h>
30 #endif
31
32
33 #include <stdio.h>
34
35
36 #include <ctype.h>
37 #include <sys/types.h>
38
39 #ifndef PARAMS
40 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
41 # define PARAMS(args) args
42 # else
43 # define PARAMS(args) ()
44 # endif
45 #endif
46
47 #if defined(STDC_HEADERS)
48 # include <stddef.h>
49 #else
50
51 # include <sys/types.h>
52 #endif
53
54 #if !defined(__STDC__) && !defined(_MSC_VER)
55 # define volatile
56 #endif
57
58 #ifdef HAVE_PROTOTYPES
59 # define _(args) args
60 #else
61 # define _(args) ()
62 #endif
63
64 #ifdef RUBY_PLATFORM
65 #include "defines.h"
66
67 # define RUBY
68 extern int rb_prohibit_interrupt;
69 extern int rb_trap_pending;
70 void rb_trap_exec _((void));
71
72 # define CHECK_INTS do {\
73 if (!rb_prohibit_interrupt) {\
74 if (rb_trap_pending) rb_trap_exec();\
75 }\
76 } while (0)
77 #endif
78
79
80 #ifdef __GNUC__
81 # ifndef atarist
82 # ifndef alloca
83 # define alloca __builtin_alloca
84 # endif
85 # endif
86 #else
87 # if defined(HAVE_ALLOCA_H)
88 # include <alloca.h>
89 # elif !defined(alloca)
90 char *alloca();
91 # endif
92 #endif
93
94 #ifdef _AIX
95 #pragma alloca
96 #endif
97
98 #ifdef HAVE_STRING_H
99 # include <string.h>
100 #else
101 # include <strings.h>
102 #endif
103
104 #ifdef C_ALLOCA
105 #define FREE_VARIABLES() alloca(0)
106 #else
107 #define FREE_VARIABLES()
108 #endif
109
110 #define FREE_AND_RETURN_VOID(stackb) do { \
111 FREE_VARIABLES(); \
112 if (stackb != stacka) xfree(stackb); \
113 return; \
114 } while(0)
115
116 #define FREE_AND_RETURN(stackb,val) do { \
117 FREE_VARIABLES(); \
118 if (stackb != stacka) xfree(stackb); \
119 return(val); \
120 } while(0)
121
122 #define DOUBLE_STACK(type) do { \
123 type *stackx; \
124 unsigned int xlen = stacke - stackb; \
125 if (stackb == stacka) { \
126 stackx = (type*)xmalloc(2 * xlen * sizeof(type)); \
127 memcpy(stackx, stackb, xlen * sizeof (type)); \
128 } \
129 else { \
130 stackx = (type*)xrealloc(stackb, 2 * xlen * sizeof(type)); \
131 } \
132 \
133 stackp = stackx + (stackp - stackb); \
134 stackb = stackx; \
135 stacke = stackb + 2 * xlen; \
136 } while (0)
137
138 #define RE_TALLOC(n,t) ((t*)alloca((n)*sizeof(t)))
139 #define TMALLOC(n,t) ((t*)xmalloc((n)*sizeof(t)))
140 #define TREALLOC(s,n,t) (s=((t*)xrealloc(s,(n)*sizeof(t))))
141
142 #define EXPAND_FAIL_STACK() DOUBLE_STACK(unsigned char*)
143 #define ENSURE_FAIL_STACK(n) \
144 do { \
145 if (stacke - stackp <= (n)) { \
146
147
148
149 \
150 \
151 \
152 EXPAND_FAIL_STACK(); \
153 } \
154 } while (0)
155
156
157 #include "regex.h"
158
159
160 static void store_jump _((char*, int, char*));
161 static void insert_jump _((int, char*, char*, char*));
162 static void store_jump_n _((char*, int, char*, unsigned));
163 static void insert_jump_n _((int, char*, char*, char*, unsigned));
164 static void insert_op _((int, char*, char*));
165 static void insert_op_2 _((int, char*, char*, int, int));
166 static int memcmp_translate _((unsigned char*, unsigned char*, int));
167
168
169
170
171
172 #define Sword 1
173 #define Sword2 2
174
175 #define SYNTAX(c) re_syntax_table[c]
176
177 static char re_syntax_table[256];
178 static void init_syntax_once _((void));
179 static const unsigned char *translate = 0;
180 static void init_regs _((struct re_registers*, unsigned int));
181 static void bm_init_skip _((int *, unsigned char*, int, const unsigned char*));
182 static int current_mbctype = MBCTYPE_ASCII;
183
184 #undef P
185
186 #ifdef RUBY
187 #include "util.h"
188 #endif
189
190 static void
191 init_syntax_once()
192 {
193 register int c;
194 static int done = 0;
195
196 if (done)
197 return;
198
199 memset(re_syntax_table, 0, sizeof re_syntax_table);
200
201 for (c=0; c<=0x7f; c++)
202 if (isalnum(c))
203 re_syntax_table[c] = Sword;
204 re_syntax_table['_'] = Sword;
205
206 for (c=0x80; c<=0xff; c++)
207 if (isalnum(c))
208 re_syntax_table[c] = Sword2;
209 done = 1;
210 }
211
212 void
213 re_set_casetable(table)
214 const char *table;
215 {
216 translate = (const unsigned char*)table;
217 }
218
219
220
221
222
223
224
225
226
227
228
229
230
231 #undef ISASCII
232 #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
233 # define ISASCII(c) 1
234 #else
235 # define ISASCII(c) isascii(c)
236 #endif
237
238 #ifdef isblank
239 # define ISBLANK(c) (ISASCII(c) && isblank(c))
240 #else
241 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
242 #endif
243 #ifdef isgraph
244 # define ISGRAPH(c) (ISASCII(c) && isgraph(c))
245 #else
246 # define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
247 #endif
248
249 #undef ISPRINT
250 #define ISPRINT(c) (ISASCII(c) && isprint(c))
251 #define ISDIGIT(c) (ISASCII(c) && isdigit(c))
252 #define ISALNUM(c) (ISASCII(c) && isalnum(c))
253 #define ISALPHA(c) (ISASCII(c) && isalpha(c))
254 #define ISCNTRL(c) (ISASCII(c) && iscntrl(c))
255 #define ISLOWER(c) (ISASCII(c) && islower(c))
256 #define ISPUNCT(c) (ISASCII(c) && ispunct(c))
257 #define ISSPACE(c) (ISASCII(c) && isspace(c))
258 #define ISUPPER(c) (ISASCII(c) && isupper(c))
259 #define ISXDIGIT(c) (ISASCII(c) && isxdigit(c))
260
261 #ifndef NULL
262 # define NULL (void *)0
263 #endif
264
265
266
267
268
269 #undef SIGN_EXTEND_CHAR
270 #if __STDC__
271 # define SIGN_EXTEND_CHAR(c) ((signed char)(c))
272 #else
273
274 # define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
275 #endif
276
277
278
279
280
281
282
283
284
285
286
287 enum regexpcode
288 {
289 unused=0,
290 exactn=1,
291 begline,
292 endline,
293 begbuf,
294
295 endbuf,
296 endbuf2,
297 begpos,
298 jump,
299 jump_past_alt,
300 on_failure_jump,
301
302 finalize_jump,
303
304 maybe_finalize_jump,
305
306
307
308
309
310
311
312 dummy_failure_jump,
313
314
315
316
317
318 push_dummy_failure,
319
320 succeed_n,
321
322
323
324 jump_n,
325
326
327 try_next,
328
329 finalize_push,
330
331 finalize_push_n,
332 set_number_at,
333
334 anychar,
335 anychar_repeat,
336 charset,
337
338
339
340
341
342
343 charset_not,
344
345 start_memory,
346
347
348
349 stop_memory,
350
351
352
353 start_paren,
354 stop_paren,
355 casefold_on,
356 casefold_off,
357 option_set,
358 start_nowidth,
359 stop_nowidth,
360 pop_and_fail,
361 stop_backtrack,
362 duplicate,
363
364
365 wordchar,
366 notwordchar,
367 wordbeg,
368 wordend,
369 wordbound,
370 notwordbound
371 };
372
373
374
375
376
377
378 #ifndef NFAILURES
379 #define NFAILURES 160
380 #endif
381
382
383 #define STORE_NUMBER(destination, number) \
384 do { (destination)[0] = (number) & 0377; \
385 (destination)[1] = (number) >> 8; } while (0)
386
387
388
389
390 #define STORE_NUMBER_AND_INCR(destination, number) \
391 do { STORE_NUMBER(destination, number); \
392 (destination) += 2; } while (0)
393
394
395
396
397 #define EXTRACT_NUMBER(destination, source) \
398 do { (destination) = *(source) & 0377; \
399 (destination) += SIGN_EXTEND_CHAR(*(char*)((source) + 1)) << 8; } while (0)
400
401
402
403
404 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
405 do { EXTRACT_NUMBER(destination, source); \
406 (source) += 2; } while (0)
407
408
409
410
411
412
413
414
415
416 long
417 re_set_syntax(syntax)
418 long syntax;
419 {
420
421 return 0;
422 }
423
424
425
426 #define TRANSLATE_P() ((options&RE_OPTION_IGNORECASE) && translate)
427 #define MAY_TRANSLATE() ((bufp->options&(RE_OPTION_IGNORECASE|RE_MAY_IGNORECASE)) && translate)
428
429
430
431
432 #define PATFETCH(c) \
433 do {if (p == pend) goto end_of_pattern; \
434 c = (unsigned char) *p++; \
435 if (TRANSLATE_P()) c = (unsigned char)translate[c]; \
436 } while (0)
437
438
439
440 #define PATFETCH_RAW(c) \
441 do {if (p == pend) goto end_of_pattern; \
442 c = (unsigned char)*p++; \
443 } while (0)
444
445
446 #define PATUNFETCH p--
447
448 #define MBC2WC(c, p) \
449 do { \
450 if (current_mbctype == MBCTYPE_UTF8) { \
451 int n = mbclen(c) - 1; \
452 c &= (1<<(BYTEWIDTH-2-n)) - 1; \
453 while (n--) { \
454 c = c << 6 | (*p++ & ((1<<6)-1)); \
455 } \
456 } \
457 else { \
458 c <<= 8; \
459 c |= (unsigned char)*(p)++; \
460 } \
461 } while (0)
462
463 #define PATFETCH_MBC(c) \
464 do { \
465 if (p + mbclen(c) - 1 >= pend) goto end_of_pattern; \
466 MBC2WC(c, p); \
467 } while(0)
468
469 #define WC2MBC1ST(c) \
470 ((current_mbctype != MBCTYPE_UTF8) ? ((c<0x100) ? (c) : (((c)>>8)&0xff)) : utf8_firstbyte(c))
471
472 typedef unsigned int (*mbc_startpos_func_t) _((const char *string, unsigned int pos));
473
474 static unsigned int asc_startpos _((const char *string, unsigned int pos));
475 static unsigned int euc_startpos _((const char *string, unsigned int pos));
476 static unsigned int sjis_startpos _((const char *string, unsigned int pos));
477 static unsigned int utf8_startpos _((const char *string, unsigned int pos));
478
479 static const mbc_startpos_func_t mbc_startpos_func[4] = {
480 asc_startpos, euc_startpos, sjis_startpos, utf8_startpos
481 };
482
483 #define mbc_startpos(start, pos) (*mbc_startpos_func[current_mbctype])((start), (pos))
484
485 static unsigned int
486 utf8_firstbyte(c)
487 unsigned long c;
488 {
489 if (c < 0x80) return c;
490 if (c <= 0x7ff) return ((c>>6)&0xff)|0xc0;
491 if (c <= 0xffff) return ((c>>12)&0xff)|0xe0;
492 if (c <= 0x1fffff) return ((c>>18)&0xff)|0xf0;
493 if (c <= 0x3ffffff) return ((c>>24)&0xff)|0xf8;
494 if (c <= 0x7fffffff) return ((c>>30)&0xff)|0xfc;
495 #if SIZEOF_INT > 4
496 if (c <= 0xfffffffff) return 0xfe;
497 #else
498 return 0xfe;
499 #endif
500 }
501
502 static void
503 print_mbc(c)
504 unsigned int c;
505 {
506 if (current_mbctype == MBCTYPE_UTF8) {
507 if (c < 0x80)
508 printf("%c", (int)c);
509 else if (c <= 0x7ff)
510 printf("%c%c", (int)utf8_firstbyte(c), (int)(c & 0x3f));
511 else if (c <= 0xffff)
512 printf("%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 6) & 0x3f),
513 (int)(c & 0x3f));
514 else if (c <= 0x1fffff)
515 printf("%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 12) & 0x3f),
516 (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
517 else if (c <= 0x3ffffff)
518 printf("%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 18) & 0x3f),
519 (int)((c >> 12) & 0x3f), (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
520 else if (c <= 0x7fffffff)
521 printf("%c%c%c%c%c%c", (int)utf8_firstbyte(c), (int)((c >> 24) & 0x3f),
522 (int)((c >> 18) & 0x3f), (int)((c >> 12) & 0x3f),
523 (int)((c >> 6) & 0x3f), (int)(c & 0x3f));
524 }
525 else if (c < 0xff) {
526 printf("\\%o", (int)c);
527 }
528 else {
529 printf("%c%c", (int)(c >> BYTEWIDTH), (int)(c &0xff));
530 }
531 }
532
533
534 #define INIT_BUF_SIZE 28
535
536
537 #define GET_BUFFER_SPACE(n) \
538 do { \
539 while (b - bufp->buffer + (n) >= bufp->allocated) \
540 EXTEND_BUFFER; \
541 } while (0)
542
543
544 #define BUFPUSH(ch) \
545 do { \
546 GET_BUFFER_SPACE(1); \
547 *b++ = (char)(ch); \
548 } while (0)
549
550
551
552
553
554 #define EXTEND_BUFFER \
555 do { char *old_buffer = bufp->buffer; \
556 if (bufp->allocated == (1L<<16)) goto too_big; \
557 bufp->allocated *= 2; \
558 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
559 bufp->buffer = (char*)xrealloc(bufp->buffer, bufp->allocated); \
560 if (bufp->buffer == 0) \
561 goto memory_exhausted; \
562 b = (b - old_buffer) + bufp->buffer; \
563 if (fixup_alt_jump) \
564 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \
565 if (laststart) \
566 laststart = (laststart - old_buffer) + bufp->buffer; \
567 begalt = (begalt - old_buffer) + bufp->buffer; \
568 if (pending_exact) \
569 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
570 } while (0)
571
572
573
574 #define SET_LIST_BIT(c) \
575 (b[(unsigned char)(c) / BYTEWIDTH] \
576 |= 1 << ((unsigned char)(c) % BYTEWIDTH))
577
578
579 #define GET_UNSIGNED_NUMBER(num) \
580 do { if (p != pend) { \
581 PATFETCH(c); \
582 while (ISDIGIT(c)) { \
583 if (num < 0) \
584 num = 0; \
585 num = num * 10 + c - '0'; \
586 if (p == pend) \
587 break; \
588 PATFETCH(c); \
589 } \
590 } \
591 } while (0)
592
593 #define STREQ(s1, s2) ((strcmp(s1, s2) == 0))
594
595 #define CHAR_CLASS_MAX_LENGTH 6
596
597 #define IS_CHAR_CLASS(string) \
598 (STREQ(string, "alpha") || STREQ(string, "upper") \
599 || STREQ(string, "lower") || STREQ(string, "digit") \
600 || STREQ(string, "alnum") || STREQ(string, "xdigit") \
601 || STREQ(string, "space") || STREQ(string, "print") \
602 || STREQ(string, "punct") || STREQ(string, "graph") \
603 || STREQ(string, "cntrl") || STREQ(string, "blank"))
604
605 #define STORE_MBC(p, c) \
606 do { \
607 (p)[0] = (unsigned char)(((c) >>24) & 0xff); \
608 (p)[1] = (unsigned char)(((c) >>16) & 0xff); \
609 (p)[2] = (unsigned char)(((c) >> 8) & 0xff); \
610 (p)[3] = (unsigned char)(((c) >> 0) & 0xff); \
611 } while (0)
612
613 #define STORE_MBC_AND_INCR(p, c) \
614 do { \
615 *(p)++ = (unsigned char)(((c) >>24) & 0xff); \
616 *(p)++ = (unsigned char)(((c) >>16) & 0xff); \
617 *(p)++ = (unsigned char)(((c) >> 8) & 0xff); \
618 *(p)++ = (unsigned char)(((c) >> 0) & 0xff); \
619 } while (0)
620
621 #define EXTRACT_MBC(p) \
622 ((unsigned int)((unsigned char)(p)[0] << 24 | \
623 (unsigned char)(p)[1] << 16 | \
624 (unsigned char)(p)[2] << 8 | \
625 (unsigned char)(p)[3]))
626
627 #define EXTRACT_MBC_AND_INCR(p) \
628 ((unsigned int)((p) += 4, \
629 (unsigned char)(p)[-4] << 24 | \
630 (unsigned char)(p)[-3] << 16 | \
631 (unsigned char)(p)[-2] << 8 | \
632 (unsigned char)(p)[-1]))
633
634 #define EXTRACT_UNSIGNED(p) \
635 ((unsigned char)(p)[0] | (unsigned char)(p)[1] << 8)
636 #define EXTRACT_UNSIGNED_AND_INCR(p) \
637 ((p) += 2, (unsigned char)(p)[-2] | (unsigned char)(p)[-1] << 8)
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654 static void
655 set_list_bits(c1, c2, b)
656 unsigned long c1, c2;
657 unsigned char *b;
658 {
659 unsigned char sbc_size = b[-1];
660 unsigned short mbc_size = EXTRACT_UNSIGNED(&b[sbc_size]);
661 unsigned short beg, end, upb;
662
663 if (c1 > c2)
664 return;
665 b = &b[sbc_size + 2];
666
667 for (beg = 0, upb = mbc_size; beg < upb; ) {
668 unsigned short mid = (unsigned short)(beg + upb) >> 1;
669
670 if ((int)c1 - 1 > (int)EXTRACT_MBC(&b[mid*8+4]))
671 beg = mid + 1;
672 else
673 upb = mid;
674 }
675
676 for (end = beg, upb = mbc_size; end < upb; ) {
677 unsigned short mid = (unsigned short)(end + upb) >> 1;
678
679 if ((int)c2 >= (int)EXTRACT_MBC(&b[mid*8]) - 1)
680 end = mid + 1;
681 else
682 upb = mid;
683 }
684
685 if (beg != end) {
686 if (c1 > EXTRACT_MBC(&b[beg*8]))
687 c1 = EXTRACT_MBC(&b[beg*8]);
688 if (c2 < EXTRACT_MBC(&b[(end - 1)*8+4]))
689 c2 = EXTRACT_MBC(&b[(end - 1)*8+4]);
690 }
691 if (end < mbc_size && end != beg + 1)
692
693 memmove(&b[(beg + 1)*8], &b[end*8], (mbc_size - end)*8);
694 STORE_MBC(&b[beg*8 + 0], c1);
695 STORE_MBC(&b[beg*8 + 4], c2);
696 mbc_size += beg - end + 1;
697 STORE_NUMBER(&b[-2], mbc_size);
698 }
699
700 static int
701 is_in_list(c, b)
702 unsigned long c;
703 const unsigned char *b;
704 {
705 unsigned short size;
706 unsigned short i, j;
707
708 size = *b++;
709 if ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH) {
710 return 1;
711 }
712 b += size + 2;
713 size = EXTRACT_UNSIGNED(&b[-2]);
714 if (size == 0) return 0;
715
716 for (i = 0, j = size; i < j; ) {
717 unsigned short k = (unsigned short)(i + j) >> 1;
718
719 if (c > EXTRACT_MBC(&b[k*8+4]))
720 i = k + 1;
721 else
722 j = k;
723 }
724 if (i < size && EXTRACT_MBC(&b[i*8]) <= c)
725 return 1;
726
727 return 0;
728 }
729
730 static void
731 print_partial_compiled_pattern(start, end)
732 unsigned char *start;
733 unsigned char *end;
734 {
735 int mcnt, mcnt2;
736 unsigned char *p = start;
737 unsigned char *pend = end;
738
739 if (start == NULL) {
740 printf("(null)\n");
741 return;
742 }
743
744
745 while (p < pend) {
746 switch ((enum regexpcode)*p++) {
747 case unused:
748 printf("/unused");
749 break;
750
751 case exactn:
752 mcnt = *p++;
753 printf("/exactn/%d", mcnt);
754 do {
755 putchar('/');
756 printf("%c", *p++);
757 }
758 while (--mcnt);
759 break;
760
761 case start_memory:
762 mcnt = *p++;
763 printf("/start_memory/%d/%d", mcnt, *p++);
764 break;
765
766 case stop_memory:
767 mcnt = *p++;
768 printf("/stop_memory/%d/%d", mcnt, *p++);
769 break;
770
771 case start_paren:
772 printf("/start_paren");
773 break;
774
775 case stop_paren:
776 printf("/stop_paren");
777 break;
778
779 case casefold_on:
780 printf("/casefold_on");
781 break;
782
783 case casefold_off:
784 printf("/casefold_off");
785 break;
786
787 case option_set:
788 printf("/option_set/%d", *p++);
789 break;
790
791 case start_nowidth:
792 EXTRACT_NUMBER_AND_INCR(mcnt, p);
793 printf("/start_nowidth//%d", mcnt);
794 break;
795
796 case stop_nowidth:
797 printf("/stop_nowidth//");
798 p += 2;
799 break;
800
801 case pop_and_fail:
802 printf("/pop_and_fail");
803 break;
804
805 case stop_backtrack:
806 printf("/stop_backtrack//");
807 p += 2;
808 break;
809
810 case duplicate:
811 printf("/duplicate/%d", *p++);
812 break;
813
814 case anychar:
815 printf("/anychar");
816 break;
817
818 case anychar_repeat:
819 printf("/anychar_repeat");
820 break;
821
822 case charset:
823 case charset_not:
824 {
825 register int c;
826
827 printf("/charset%s",
828 (enum regexpcode)*(p - 1) == charset_not ? "_not" : "");
829
830 mcnt = *p++;
831 printf("/%d", mcnt);
832 for (c = 0; c < mcnt; c++) {
833 unsigned bit;
834 unsigned char map_byte = p[c];
835
836 putchar('/');
837
838 for (bit = 0; bit < BYTEWIDTH; bit++)
839 if (map_byte & (1 << bit))
840 printf("%c", c * BYTEWIDTH + bit);
841 }
842 p += mcnt;
843 mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
844 putchar('/');
845 while (mcnt--) {
846 print_mbc(EXTRACT_MBC_AND_INCR(p));
847 putchar('-');
848 print_mbc(EXTRACT_MBC_AND_INCR(p));
849 }
850 break;
851 }
852
853 case begline:
854 printf("/begline");
855 break;
856
857 case endline:
858 printf("/endline");
859 break;
860
861 case on_failure_jump:
862 EXTRACT_NUMBER_AND_INCR(mcnt, p);
863 printf("/on_failure_jump//%d", mcnt);
864 break;
865
866 case dummy_failure_jump:
867 EXTRACT_NUMBER_AND_INCR(mcnt, p);
868 printf("/dummy_failure_jump//%d", mcnt);
869 break;
870
871 case push_dummy_failure:
872 printf("/push_dummy_failure");
873 break;
874
875 case finalize_jump:
876 EXTRACT_NUMBER_AND_INCR(mcnt, p);
877 printf("/finalize_jump//%d", mcnt);
878 break;
879
880 case maybe_finalize_jump:
881 EXTRACT_NUMBER_AND_INCR(mcnt, p);
882 printf("/maybe_finalize_jump//%d", mcnt);
883 break;
884
885 case jump_past_alt:
886 EXTRACT_NUMBER_AND_INCR(mcnt, p);
887 printf("/jump_past_alt//%d", mcnt);
888 break;
889
890 case jump:
891 EXTRACT_NUMBER_AND_INCR(mcnt, p);
892 printf("/jump//%d", mcnt);
893 break;
894
895 case succeed_n:
896 EXTRACT_NUMBER_AND_INCR(mcnt, p);
897 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
898 printf("/succeed_n//%d//%d", mcnt, mcnt2);
899 break;
900
901 case jump_n:
902 EXTRACT_NUMBER_AND_INCR(mcnt, p);
903 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
904 printf("/jump_n//%d//%d", mcnt, mcnt2);
905 break;
906
907 case set_number_at:
908 EXTRACT_NUMBER_AND_INCR(mcnt, p);
909 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
910 printf("/set_number_at//%d//%d", mcnt, mcnt2);
911 break;
912
913 case try_next:
914 EXTRACT_NUMBER_AND_INCR(mcnt, p);
915 printf("/try_next//%d", mcnt);
916 break;
917
918 case finalize_push:
919 EXTRACT_NUMBER_AND_INCR(mcnt, p);
920 printf("/finalize_push//%d", mcnt);
921 break;
922
923 case finalize_push_n:
924 EXTRACT_NUMBER_AND_INCR(mcnt, p);
925 EXTRACT_NUMBER_AND_INCR(mcnt2, p);
926 printf("/finalize_push_n//%d//%d", mcnt, mcnt2);
927 break;
928
929 case wordbound:
930 printf("/wordbound");
931 break;
932
933 case notwordbound:
934 printf("/notwordbound");
935 break;
936
937 case wordbeg:
938 printf("/wordbeg");
939 break;
940
941 case wordend:
942 printf("/wordend");
943
944 case wordchar:
945 printf("/wordchar");
946 break;
947
948 case notwordchar:
949 printf("/notwordchar");
950 break;
951
952 case begbuf:
953 printf("/begbuf");
954 break;
955
956 case endbuf:
957 printf("/endbuf");
958 break;
959
960 case endbuf2:
961 printf("/endbuf2");
962 break;
963
964 case begpos:
965 printf("/begpos");
966 break;
967
968 default:
969 printf("?%d", *(p-1));
970 }
971 }
972 printf("/\n");
973 }
974
975
976 static void
977 print_compiled_pattern(bufp)
978 struct re_pattern_buffer *bufp;
979 {
980 unsigned char *buffer = (unsigned char*)bufp->buffer;
981
982 print_partial_compiled_pattern(buffer, buffer + bufp->used);
983 }
984
985 static char*
986 calculate_must_string(start, end)
987 char *start;
988 char *end;
989 {
990 int mcnt;
991 int max = 0;
992 char *p = start;
993 char *pend = end;
994 char *must = 0;
995
996 if (start == NULL) return 0;
997
998
999 while (p < pend) {
1000 switch ((enum regexpcode)*p++) {
1001 case unused:
1002 break;
1003
1004 case exactn:
1005 mcnt = *p;
1006 if (mcnt > max) {
1007 must = p;
1008 max = mcnt;
1009 }
1010 p += mcnt+1;
1011 break;
1012
1013 case start_memory:
1014 case stop_memory:
1015 p += 2;
1016 break;
1017
1018 case duplicate:
1019 p++;
1020 break;
1021
1022 case casefold_on:
1023 case casefold_off:
1024 return 0;
1025
1026 case pop_and_fail:
1027 case anychar:
1028 case anychar_repeat:
1029 case begline:
1030 case endline:
1031 case wordbound:
1032 case notwordbound:
1033 case wordbeg:
1034 case wordend:
1035 case wordchar:
1036 case notwordchar:
1037 case begbuf:
1038 case endbuf:
1039 case endbuf2:
1040 case begpos:
1041 case push_dummy_failure:
1042 case start_paren:
1043 case stop_paren:
1044 case option_set:
1045 break;
1046
1047 case charset:
1048 case charset_not:
1049 mcnt = *p++;
1050 p += mcnt;
1051 mcnt = EXTRACT_UNSIGNED_AND_INCR(p);
1052 while (mcnt--) {
1053 p += 8;
1054 }
1055 break;
1056
1057 case on_failure_jump:
1058 EXTRACT_NUMBER_AND_INCR(mcnt, p);
1059 if (mcnt > 0) p += mcnt;
1060 if ((enum regexpcode)p[-3] == jump) {
1061 p -= 2;
1062 EXTRACT_NUMBER_AND_INCR(mcnt, p);
1063 if (mcnt > 0) p += mcnt;
1064 }
1065 break;
1066
1067 case dummy_failure_jump:
1068 case succeed_n:
1069 case try_next:
1070 case jump:
1071 EXTRACT_NUMBER_AND_INCR(mcnt, p);
1072 if (mcnt > 0) p += mcnt;
1073 break;
1074
1075 case start_nowidth:
1076 case stop_nowidth:
1077 case stop_backtrack:
1078 case finalize_jump:
1079 case maybe_finalize_jump:
1080 case finalize_push:
1081 p += 2;
1082 break;
1083
1084 case jump_n:
1085 case set_number_at:
1086 case finalize_push_n:
1087 p += 4;
1088 break;
1089
1090 default:
1091 break;
1092 }
1093 }
1094 return must;
1095 }
1096
1097 static unsigned int
1098 read_backslash(c)
1099 int c;
1100 {
1101 switch (c) {
1102 case 'n':
1103 return '\n';
1104
1105 case 't':
1106 return '\t';
1107
1108 case 'r':
1109 return '\r';
1110
1111 case 'f':
1112 return '\f';
1113
1114 case 'v':
1115 return '\v';
1116
1117 case 'a':
1118 return '\007';
1119
1120 case 'b':
1121 return '\010';
1122
1123 case 'e':
1124 return '\033';
1125 }
1126 return c;
1127 }
1128
1129 static unsigned int
1130 read_special(p, pend, pp)
1131 const char *p, *pend, **pp;
1132 {
1133 int c;
1134
1135 PATFETCH_RAW(c);
1136 switch (c) {
1137 case 'M':
1138 PATFETCH_RAW(c);
1139 if (c != '-') return -1;
1140 PATFETCH_RAW(c);
1141 *pp = p;
1142 if (c == '\\') {
1143 return read_special(p, pend, pp) | 0x80;
1144 }
1145 else if (c == -1) return ~0;
1146 else {
1147 return ((c & 0xff) | 0x80);
1148 }
1149
1150 case 'C':
1151 PATFETCH_RAW(c);
1152 if (c != '-') return ~0;
1153 case 'c':
1154 PATFETCH_RAW(c);
1155 *pp = p;
1156 if (c == '\\') {
1157 c = read_special(p, pend, pp);
1158 }
1159 else if (c == '?') return 0177;
1160 else if (c == -1) return ~0;
1161 return c & 0x9f;
1162 default:
1163 return read_backslash(c);
1164 }
1165
1166 end_of_pattern:
1167 return ~0;
1168 }
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185 char *
1186 re_compile_pattern(pattern, size, bufp)
1187 const char *pattern;
1188 int size;
1189 struct re_pattern_buffer *bufp;
1190 {
1191 register char *b = bufp->buffer;
1192 register const char *p = pattern;
1193 const char *nextp;
1194 const char *pend = pattern + size;
1195 register unsigned int c, c1 = 0;
1196 const char *p0;
1197 int numlen;
1198 #define ERROR_MSG_MAX_SIZE 200
1199 static char error_msg[ERROR_MSG_MAX_SIZE+1];
1200
1201
1202
1203
1204
1205
1206 char *pending_exact = 0;
1207
1208
1209
1210
1211
1212 char *fixup_alt_jump = 0;
1213
1214
1215
1216
1217 char *laststart = 0;
1218
1219
1220
1221 char zero_times_ok;
1222
1223
1224
1225 char many_times_ok;
1226
1227
1228
1229 char greedy;
1230
1231
1232
1233 char *begalt = b;
1234
1235
1236
1237 const char *beg_interval;
1238
1239
1240 int lower_bound;
1241
1242
1243 int upper_bound;
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253 int stacka[40];
1254 int *stackb = stacka;
1255 int *stackp = stackb;
1256 int *stacke = stackb + 40;
1257
1258
1259
1260
1261
1262 int regnum = 1;
1263
1264 int range = 0;
1265 int had_mbchar = 0;
1266 int had_num_literal = 0;
1267 int had_char_class = 0;
1268
1269 int options = bufp->options;
1270
1271 bufp->fastmap_accurate = 0;
1272 bufp->must = 0;
1273 bufp->must_skip = 0;
1274
1275
1276 init_syntax_once();
1277
1278 if (bufp->allocated == 0) {
1279 bufp->allocated = INIT_BUF_SIZE;
1280
1281 bufp->buffer = (char*)xrealloc(bufp->buffer, INIT_BUF_SIZE);
1282 if (!bufp->buffer) goto memory_exhausted;
1283 begalt = b = bufp->buffer;
1284 }
1285
1286 while (p != pend) {
1287 PATFETCH(c);
1288
1289 switch (c) {
1290 case '$':
1291 if (bufp->options & RE_OPTION_SINGLELINE) {
1292 BUFPUSH(endbuf);
1293 }
1294 else {
1295 p0 = p;
1296
1297
1298
1299 while (p0 != pend) {
1300 if (*p0 == '\\' && p0 + 1 != pend
1301 && (p0[1] == 'b' || p0[1] == 'B'))
1302 p0 += 2;
1303 else
1304 break;
1305 }
1306 BUFPUSH(endline);
1307 }
1308 break;
1309
1310 case '^':
1311 if (bufp->options & RE_OPTION_SINGLELINE)
1312 BUFPUSH(begbuf);
1313 else
1314 BUFPUSH(begline);
1315 break;
1316
1317 case '+':
1318 case '?':
1319 case '*':
1320
1321 if (!laststart) {
1322 snprintf(error_msg, ERROR_MSG_MAX_SIZE,
1323 "invalid regular expression; there's no previous pattern, to which '%c' would define cardinality at %d",
1324 c, p-pattern);
1325 FREE_AND_RETURN(stackb, error_msg);
1326 }
1327
1328
1329 zero_times_ok = c != '+';
1330 many_times_ok = c != '?';
1331 greedy = 1;
1332 if (p != pend) {
1333 PATFETCH(c);
1334 switch (c) {
1335 case '?':
1336 greedy = 0;
1337 break;
1338 case '*':
1339 case '+':
1340 goto nested_meta;
1341 default:
1342 PATUNFETCH;
1343 break;
1344 }
1345 }
1346
1347 repeat:
1348
1349
1350 if (!laststart)
1351 break;
1352
1353 if (greedy && many_times_ok && *laststart == anychar && b - laststart <= 2) {
1354 if (b[-1] == stop_paren)
1355 b--;
1356 if (zero_times_ok)
1357 *laststart = anychar_repeat;
1358 else {
1359 BUFPUSH(anychar_repeat);
1360 }
1361 break;
1362 }
1363
1364
1365 if (many_times_ok) {
1366
1367
1368
1369
1370 GET_BUFFER_SPACE(3);
1371 store_jump(b,greedy?maybe_finalize_jump:finalize_push,laststart-3);
1372 b += 3;
1373 }
1374
1375
1376
1377 GET_BUFFER_SPACE(3);
1378 insert_jump(on_failure_jump, laststart, b + 3, b);
1379 b += 3;
1380
1381 if (zero_times_ok) {
1382 if (greedy == 0) {
1383 GET_BUFFER_SPACE(3);
1384 insert_jump(try_next, laststart, b + 3, b);
1385 b += 3;
1386 }
1387 }
1388 else {
1389
1390
1391
1392
1393
1394 GET_BUFFER_SPACE(3);
1395 insert_jump(dummy_failure_jump, laststart, laststart + 6, b);
1396 b += 3;
1397 }
1398 break;
1399
1400 case '.':
1401 laststart = b;
1402 BUFPUSH(anychar);
1403 break;
1404
1405 case '[':
1406 if (p == pend)
1407 FREE_AND_RETURN(stackb, "invalid regular expression; '[' can't be the last character ie. can't start range at the end of pattern");
1408 while ((b - bufp->buffer + 9 + (1 << BYTEWIDTH) / BYTEWIDTH)
1409 > bufp->allocated)
1410 EXTEND_BUFFER;
1411
1412 laststart = b;
1413 if (*p == '^') {
1414 BUFPUSH(charset_not);
1415 p++;
1416 }
1417 else
1418 BUFPUSH(charset);
1419 p0 = p;
1420
1421 BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
1422
1423 memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
1424
1425 had_mbchar = 0;
1426 had_num_literal = 0;
1427 had_char_class = 0;
1428
1429
1430 for (;;) {
1431 int size;
1432 unsigned last = (unsigned)-1;
1433
1434 if ((size = EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH]))
1435 || current_mbctype) {
1436
1437
1438 size = (1 << BYTEWIDTH) / BYTEWIDTH + 2 + size*8 + 8;
1439 while (b + size + 1 > bufp->buffer + bufp->allocated)
1440 EXTEND_BUFFER;
1441 }
1442 range_retry:
1443 if (range && had_char_class) {
1444 FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as an end value of range");
1445 }
1446 PATFETCH(c);
1447
1448 if (c == ']') {
1449 if (p == p0 + 1) {
1450 if (p == pend)
1451 FREE_AND_RETURN(stackb, "invalid regular expression; empty character class");
1452 }
1453 else
1454
1455
1456
1457 break;
1458 }
1459
1460
1461 if (had_char_class && c == '-' && *p != ']')
1462 FREE_AND_RETURN(stackb, "invalid regular expression; can't use character class as a start value of range");
1463 if (ismbchar(c)) {
1464 PATFETCH_MBC(c);
1465 had_mbchar++;
1466 }
1467 had_char_class = 0;
1468
1469
1470 if (c == '\\') {
1471 PATFETCH_RAW(c);
1472 switch (c) {
1473 case 'w':
1474 for (c = 0; c < (1 << BYTEWIDTH); c++) {
1475 if (SYNTAX(c) == Sword ||
1476 (!current_mbctype && SYNTAX(c) == Sword2))
1477 SET_LIST_BIT(c);
1478 }
1479 if (current_mbctype) {
1480 set_list_bits(0x80, 0xffffffff, b);
1481 }
1482 had_char_class = 1;
1483 last = -1;
1484 continue;
1485
1486 case 'W':
1487 for (c = 0; c < (1 << BYTEWIDTH); c++) {
1488 if (SYNTAX(c) != Sword &&
1489 ((current_mbctype && !re_mbctab[c]) ||
1490 (!current_mbctype && SYNTAX(c) != Sword2)))
1491 SET_LIST_BIT(c);
1492 }
1493 had_char_class = 1;
1494 last = -1;
1495 continue;
1496
1497 case 's':
1498 for (c = 0; c < 256; c++)
1499 if (ISSPACE(c))
1500 SET_LIST_BIT(c);
1501 had_char_class = 1;
1502 last = -1;
1503 continue;
1504
1505 case 'S':
1506 for (c = 0; c < 256; c++)
1507 if (!ISSPACE(c))
1508 SET_LIST_BIT(c);
1509 if (current_mbctype)
1510 set_list_bits(0x80, 0xffffffff, b);
1511 had_char_class = 1;
1512 last = -1;
1513 continue;
1514
1515 case 'd':
1516 for (c = '0'; c <= '9'; c++)
1517 SET_LIST_BIT(c);
1518 had_char_class = 1;
1519 last = -1;
1520 continue;
1521
1522 case 'D':
1523 for (c = 0; c < 256; c++)
1524 if (!ISDIGIT(c))
1525 SET_LIST_BIT(c);
1526 if (current_mbctype)
1527 set_list_bits(0x80, 0xffffffff, b);
1528 had_char_class = 1;
1529 last = -1;
1530 continue;
1531
1532 case 'x':
1533 c = scan_hex(p, 2, &numlen);
1534 if (numlen == 0) goto invalid_escape;
1535 p += numlen;
1536 had_num_literal = 1;
1537 break;
1538
1539 case '0': case '1': case '2': case '3': case '4':
1540 case '5': case '6': case '7': case '8': case '9':
1541 PATUNFETCH;
1542 c = scan_oct(p, 3, &numlen);
1543 p += numlen;
1544 had_num_literal = 1;
1545 break;
1546
1547 case 'M':
1548 case 'C':
1549 case 'c':
1550 {
1551 char *pp;
1552
1553 --p;
1554 c = read_special(p, pend, &pp);
1555 if (c > 255) goto invalid_escape;
1556 p = pp;
1557 had_num_literal = 1;
1558 }
1559 break;
1560
1561 default:
1562 c = read_backslash(c);
1563 if (ismbchar(c)) {
1564 PATFETCH_MBC(c);
1565 had_mbchar++;
1566 }
1567 break;
1568 }
1569 }
1570
1571
1572 if (range) {
1573 if (last > c)
1574 goto invalid_pattern;
1575
1576 range = 0;
1577 if (had_mbchar == 0) {
1578 for (;last<=c;last++)
1579 SET_LIST_BIT(last);
1580 }
1581 else if (had_mbchar == 2) {
1582 set_list_bits(last, c, b);
1583 }
1584 else {
1585
1586 goto invalid_pattern;
1587 }
1588 }
1589 else if (p[0] == '-' && p[1] != ']') {
1590 last = c;
1591 PATFETCH(c1);
1592 range = 1;
1593 goto range_retry;
1594 }
1595 else if (c == '[' && *p == ':') {
1596
1597 char str[CHAR_CLASS_MAX_LENGTH + 1];
1598
1599 PATFETCH_RAW(c);
1600 c1 = 0;
1601
1602
1603 if (p == pend)
1604 FREE_AND_RETURN(stackb, "invalid regular expression; re can't end '[[:'");
1605
1606 for (;;) {
1607 PATFETCH (c);
1608 if (c == ':' || c == ']' || p == pend
1609 || c1 == CHAR_CLASS_MAX_LENGTH)
1610 break;
1611 str[c1++] = c;
1612 }
1613 str[c1] = '\0';
1614
1615
1616
1617
1618 if (c == ':' && *p == ']') {
1619 int ch;
1620 char is_alnum = STREQ(str, "alnum");
1621 char is_alpha = STREQ(str, "alpha");
1622 char is_blank = STREQ(str, "blank");
1623 char is_cntrl = STREQ(str, "cntrl");
1624 char is_digit = STREQ(str, "digit");
1625 char is_graph = STREQ(str, "graph");
1626 char is_lower = STREQ(str, "lower");
1627 char is_print = STREQ(str, "print");
1628 char is_punct = STREQ(str, "punct");
1629 char is_space = STREQ(str, "space");
1630 char is_upper = STREQ(str, "upper");
1631 char is_xdigit = STREQ(str, "xdigit");
1632
1633 if (!IS_CHAR_CLASS(str)){
1634 snprintf(error_msg, ERROR_MSG_MAX_SIZE,
1635 "invalid regular expression; [:%s:] is not a character class", str);
1636 FREE_AND_RETURN(stackb, error_msg);
1637 }
1638
1639
1640 PATFETCH(c);
1641
1642 if (p == pend)
1643 FREE_AND_RETURN(stackb, "invalid regular expression; range doesn't have ending ']' after a character class");
1644
1645 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) {
1646 if ( (is_alnum && ISALNUM(ch))
1647 || (is_alpha && ISALPHA(ch))
1648 || (is_blank && ISBLANK(ch))
1649 || (is_cntrl && ISCNTRL(ch))
1650 || (is_digit && ISDIGIT(ch))
1651 || (is_graph && ISGRAPH(ch))
1652 || (is_lower && ISLOWER(ch))
1653 || (is_print && ISPRINT(ch))
1654 || (is_punct && ISPUNCT(ch))
1655 || (is_space && ISSPACE(ch))
1656 || (is_upper && ISUPPER(ch))
1657 || (is_xdigit && ISXDIGIT(ch)))
1658 SET_LIST_BIT(ch);
1659 }
1660 had_char_class = 1;
1661 }
1662 else {
1663 c1++;
1664 while (c1--)
1665 PATUNFETCH;
1666 SET_LIST_BIT(TRANSLATE_P()?translate['[']:'[');
1667 SET_LIST_BIT(TRANSLATE_P()?translate[':']:':');
1668 had_char_class = 0;
1669 last = ':';
1670 }
1671 }
1672 else if (had_mbchar == 0 && (!current_mbctype || !had_num_literal)) {
1673 SET_LIST_BIT(c);
1674 had_num_literal = 0;
1675 }
1676 else
1677 set_list_bits(c, c, b);
1678 had_mbchar = 0;
1679 }
1680
1681
1682
1683 while ((int)b[-1] > 0 && b[b[-1] - 1] == 0)
1684 b[-1]--;
1685 if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH)
1686 memmove(&b[b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH],
1687 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8);
1688 b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[b[-1]])*8;
1689 break;
1690
1691 case '(':
1692 {
1693 int old_options = options;
1694 int push_option = 0;
1695 int casefold = 0;
1696
1697 PATFETCH(c);
1698 if (c == '?') {
1699 int negative = 0;
1700
1701 PATFETCH_RAW(c);
1702 switch (c) {
1703 case 'x': case 'm': case 'i': case '-':
1704 for (;;) {
1705 switch (c) {
1706 case '-':
1707 negative = 1;
1708 break;
1709
1710 case ':':
1711 case ')':
1712 break;
1713
1714 case 'x':
1715 if (negative)
1716 options &= ~RE_OPTION_EXTENDED;
1717 else
1718 options |= RE_OPTION_EXTENDED;
1719 break;
1720
1721 case 'm':
1722 if (negative) {
1723 if (options&RE_OPTION_MULTILINE) {
1724 options &= ~RE_OPTION_MULTILINE;
1725 }
1726 }
1727 else if (!(options&RE_OPTION_MULTILINE)) {
1728 options |= RE_OPTION_MULTILINE;
1729 }
1730 push_option = 1;
1731 break;
1732
1733 case 'i':
1734 if (negative) {
1735 if (options&RE_OPTION_IGNORECASE) {
1736 options &= ~RE_OPTION_IGNORECASE;
1737 }
1738 }
1739 else if (!(options&RE_OPTION_IGNORECASE)) {
1740 options |= RE_OPTION_IGNORECASE;
1741 }
1742 casefold = 1;
1743 break;
1744
1745 default:
1746 FREE_AND_RETURN(stackb, "undefined (?...) inline option");
1747 }
1748 if (c == ')') {
1749 c = '#';
1750 break;
1751 }
1752 if (c == ':') break;
1753 PATFETCH_RAW(c);
1754 }
1755 break;
1756
1757 case '#':
1758 for (;;) {
1759 PATFETCH(c);
1760 if (c == ')') break;
1761 }
1762 c = '#';
1763 break;
1764
1765 case ':':
1766 case '=':
1767 case '!':
1768 case '>':
1769 break;
1770
1771 default:
1772 FREE_AND_RETURN(stackb, "undefined (?...) sequence");
1773 }
1774 }
1775 else {
1776 PATUNFETCH;
1777 c = '(';
1778 }
1779 if (c == '#') {
1780 if (push_option) {
1781 BUFPUSH(option_set);
1782 BUFPUSH(options);
1783 }
1784 if (casefold) {
1785 if (options & RE_OPTION_IGNORECASE)
1786 BUFPUSH(casefold_on);
1787 else
1788 BUFPUSH(casefold_off);
1789 }
1790 break;
1791 }
1792 if (stackp+8 >= stacke) {
1793 DOUBLE_STACK(int);
1794 }
1795
1796
1797
1798
1799 *stackp++ = b - bufp->buffer;
1800 *stackp++ = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1801 *stackp++ = begalt - bufp->buffer;
1802 switch (c) {
1803 case '(':
1804 BUFPUSH(start_memory);
1805 BUFPUSH(regnum);
1806 *stackp++ = regnum++;
1807 *stackp++ = b - bufp->buffer;
1808 BUFPUSH(0);
1809
1810 if (regnum >= RE_REG_MAX) goto too_big;
1811 break;
1812
1813 case '=':
1814 case '!':
1815 case '>':
1816 BUFPUSH(start_nowidth);
1817 *stackp++ = b - bufp->buffer;
1818 BUFPUSH(0);
1819 BUFPUSH(0);
1820 if (c != '!') break;
1821
1822 BUFPUSH(on_failure_jump);
1823 *stackp++ = b - bufp->buffer;
1824 BUFPUSH(0);
1825 BUFPUSH(0);
1826 break;
1827
1828 case ':':
1829 BUFPUSH(start_paren);
1830 pending_exact = 0;
1831 default:
1832 break;
1833 }
1834 if (push_option) {
1835 BUFPUSH(option_set);
1836 BUFPUSH(options);
1837 }
1838 if (casefold) {
1839 if (options & RE_OPTION_IGNORECASE)
1840 BUFPUSH(casefold_on);
1841 else
1842 BUFPUSH(casefold_off);
1843 }
1844 *stackp++ = c;
1845 *stackp++ = old_options;
1846 fixup_alt_jump = 0;
1847 laststart = 0;
1848 begalt = b;
1849 }
1850 break;
1851
1852 case ')':
1853 if (stackp == stackb)
1854 FREE_AND_RETURN(stackb, "unmatched )");
1855
1856 pending_exact = 0;
1857 if (fixup_alt_jump) {
1858
1859
1860
1861
1862 BUFPUSH(push_dummy_failure);
1863
1864
1865
1866 store_jump(fixup_alt_jump, jump, b);
1867 }
1868 if (options != stackp[-1]) {
1869 if ((options ^ stackp[-1]) & RE_OPTION_IGNORECASE) {
1870 BUFPUSH((options&RE_OPTION_IGNORECASE)?casefold_off:casefold_on);
1871 }
1872 if ((options ^ stackp[-1]) != RE_OPTION_IGNORECASE) {
1873 BUFPUSH(option_set);
1874 BUFPUSH(stackp[-1]);
1875 }
1876 }
1877 p0 = b;
1878 options = *--stackp;
1879 switch (c = *--stackp) {
1880 case '(':
1881 {
1882 char *loc = bufp->buffer + *--stackp;
1883 *loc = regnum - stackp[-1];
1884 BUFPUSH(stop_memory);
1885 BUFPUSH(stackp[-1]);
1886 BUFPUSH(regnum - stackp[-1]);
1887 stackp--;
1888 }
1889 break;
1890
1891 case '!':
1892 BUFPUSH(pop_and_fail);
1893
1894 STORE_NUMBER(bufp->buffer+stackp[-1], b - bufp->buffer - stackp[-1] - 2);
1895 stackp--;
1896
1897 case '=':
1898 BUFPUSH(stop_nowidth);
1899
1900 STORE_NUMBER(bufp->buffer+stackp[-1], b - bufp->buffer - stackp[-1] - 2);
1901 BUFPUSH(0);
1902 BUFPUSH(0);
1903 stackp--;
1904 break;
1905
1906 case '>':
1907 BUFPUSH(stop_backtrack);
1908
1909 STORE_NUMBER(bufp->buffer+stackp[-1], b - bufp->buffer - stackp[-1] - 2);
1910 BUFPUSH(0);
1911 BUFPUSH(0);
1912 stackp--;
1913 break;
1914
1915 case ':':
1916 BUFPUSH(stop_paren);
1917 break;
1918
1919 default:
1920 break;
1921 }
1922 begalt = *--stackp + bufp->buffer;
1923 stackp--;
1924 fixup_alt_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
1925 laststart = *--stackp + bufp->buffer;
1926 if (c == '!' || c == '=') laststart = b;
1927 break;
1928
1929 case '|':
1930
1931
1932 GET_BUFFER_SPACE(3);
1933 insert_jump(on_failure_jump, begalt, b + 6, b);
1934 pending_exact = 0;
1935 b += 3;
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952 if (fixup_alt_jump)
1953 store_jump(fixup_alt_jump, jump_past_alt, b);
1954
1955
1956
1957
1958 fixup_alt_jump = b;
1959 GET_BUFFER_SPACE(3);
1960 b += 3;
1961
1962 laststart = 0;
1963 begalt = b;
1964 break;
1965
1966 case '{':
1967
1968 if (!laststart) {
1969 snprintf(error_msg, ERROR_MSG_MAX_SIZE,
1970 "invalid regular expression; there's no previous pattern, to which '{' would define cardinality at %d",
1971 p-pattern);
1972 FREE_AND_RETURN(stackb, error_msg);
1973 }
1974 if( p == pend)
1975 FREE_AND_RETURN(stackb, "invalid regular expression; '{' can't be last character" );
1976
1977 beg_interval = p - 1;
1978
1979 lower_bound = -1;
1980 upper_bound = -1;
1981 GET_UNSIGNED_NUMBER(lower_bound);
1982 if (c == ',') {
1983 GET_UNSIGNED_NUMBER(upper_bound);
1984 }
1985 else
1986
1987 upper_bound = lower_bound;
1988
1989 if (lower_bound < 0 || c != '}')
1990 goto unfetch_interval;
1991
1992 if (lower_bound >= RE_DUP_MAX || upper_bound >= RE_DUP_MAX)
1993 FREE_AND_RETURN(stackb, "too big quantifier in {,}");
1994 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1995 if (lower_bound > upper_bound)
1996 FREE_AND_RETURN(stackb, "can't do {n,m} with n > m");
1997
1998 beg_interval = 0;
1999 pending_exact = 0;
2000
2001 greedy = 1;
2002 if (p != pend) {
2003 PATFETCH(c);
2004 if (c == '?') greedy = 0;
2005 else PATUNFETCH;
2006 }
2007
2008 if (lower_bound == 0) {
2009 zero_times_ok = 1;
2010 if (upper_bound == RE_DUP_MAX) {
2011 many_times_ok = 1;
2012 goto repeat;
2013 }
2014 if (upper_bound == 1) {
2015 many_times_ok = 0;
2016 goto repeat;
2017 }
2018 }
2019 if (lower_bound == 1) {
2020 if (upper_bound == 1) {
2021
2022 break;
2023 }
2024 if (upper_bound == RE_DUP_MAX) {
2025 many_times_ok = 1;
2026 zero_times_ok = 0;
2027 goto repeat;
2028 }
2029 }
2030
2031
2032
2033
2034
2035 if (upper_bound == 0) {
2036 GET_BUFFER_SPACE(3);
2037 insert_jump(jump, laststart, b + 3, b);
2038 b += 3;
2039 break;
2040 }
2041
2042
2043 if (lower_bound == upper_bound) {
2044 int mcnt;
2045 int skip_stop_paren = 0;
2046
2047 if (b[-1] == stop_paren) {
2048 skip_stop_paren = 1;
2049 b--;
2050 }
2051
2052 if (*laststart == exactn && laststart[1]+2 == b - laststart
2053 && laststart[1]*lower_bound < 256) {
2054 mcnt = laststart[1];
2055 GET_BUFFER_SPACE((lower_bound-1)*mcnt);
2056 laststart[1] = lower_bound*mcnt;
2057 while (--lower_bound) {
2058 memcpy(b, laststart+2, mcnt);
2059 b += mcnt;
2060 }
2061 if (skip_stop_paren) BUFPUSH(stop_paren);
2062 break;
2063 }
2064
2065 if (lower_bound < 5 && b - laststart < 10) {
2066
2067
2068 mcnt = b - laststart;
2069 GET_BUFFER_SPACE((lower_bound-1)*mcnt);
2070 while (--lower_bound) {
2071 memcpy(b, laststart, mcnt);
2072 b += mcnt;
2073 }
2074 if (skip_stop_paren) BUFPUSH(stop_paren);
2075 break;
2076 }
2077 if (skip_stop_paren) b++;
2078 }
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089 {
2090
2091 unsigned nbytes = upper_bound == 1 ? 10 : 20;
2092
2093 GET_BUFFER_SPACE(nbytes);
2094
2095
2096
2097
2098
2099 insert_jump_n(succeed_n, laststart, b + (nbytes/2),
2100 b, lower_bound);
2101 b += 5;
2102
2103
2104
2105
2106
2107 insert_op_2(set_number_at, laststart, b, 5, lower_bound);
2108 b += 5;
2109
2110 if (upper_bound > 1) {
2111
2112
2113
2114
2115
2116
2117
2118 GET_BUFFER_SPACE(5);
2119 store_jump_n(b, greedy?jump_n:finalize_push_n, laststart + 5,
2120 upper_bound - 1);
2121 b += 5;
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137 insert_op_2(set_number_at, laststart, b, b - laststart,
2138 upper_bound - 1);
2139 b += 5;
2140 }
2141 }
2142 break;
2143
2144 unfetch_interval:
2145
2146 p = beg_interval;
2147 beg_interval = 0;
2148
2149
2150 PATFETCH(c);
2151 goto normal_char;
2152
2153 case '\\':
2154 if (p == pend)
2155 FREE_AND_RETURN(stackb, "invalid regular expression; '\\' can't be last character");
2156
2157
2158
2159 PATFETCH_RAW(c);
2160 switch (c) {
2161 case 's':
2162 case 'S':
2163 case 'd':
2164 case 'D':
2165 while (b - bufp->buffer + 9 + (1 << BYTEWIDTH) / BYTEWIDTH
2166 > bufp->allocated)
2167 EXTEND_BUFFER;
2168
2169 laststart = b;
2170 if (c == 's' || c == 'd') {
2171 BUFPUSH(charset);
2172 }
2173 else {
2174 BUFPUSH(charset_not);
2175 }
2176
2177 BUFPUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2178 memset(b, 0, (1 << BYTEWIDTH) / BYTEWIDTH + 2);
2179 if (c == 's' || c == 'S') {
2180 SET_LIST_BIT(' ');
2181 SET_LIST_BIT('\t');
2182 SET_LIST_BIT('\n');
2183 SET_LIST_BIT('\r');
2184 SET_LIST_BIT('\f');
2185 }
2186 else {
2187 char cc;
2188
2189 for (cc = '0'; cc <= '9'; cc++) {
2190 SET_LIST_BIT(cc);
2191 }
2192 }
2193
2194 while ((int)b[-1] > 0 && b[b[-1] - 1] == 0)
2195 b[-1]--;
2196 if (b[-1] != (1 << BYTEWIDTH) / BYTEWIDTH)
2197 memmove(&b[b[-1]], &b[(1 << BYTEWIDTH) / BYTEWIDTH],
2198 2 + EXTRACT_UNSIGNED(&b[(1 << BYTEWIDTH) / BYTEWIDTH])*8);
2199 b += b[-1] + 2 + EXTRACT_UNSIGNED(&b[b[-1]])*8;
2200 break;
2201
2202 case 'w':
2203 laststart = b;
2204 BUFPUSH(wordchar);
2205 break;
2206
2207 case 'W':
2208 laststart = b;
2209 BUFPUSH(notwordchar);
2210 break;
2211
2212 #ifndef RUBY
2213 case '<':
2214 BUFPUSH(wordbeg);
2215 break;
2216
2217 case '>':
2218 BUFPUSH(wordend);
2219 break;
2220 #endif
2221
2222 case 'b':
2223 BUFPUSH(wordbound);
2224 break;
2225
2226 case 'B':
2227 BUFPUSH(notwordbound);
2228 break;
2229
2230 case 'A':
2231 BUFPUSH(begbuf);
2232 break;
2233
2234 case 'Z':
2235 if ((bufp->options & RE_OPTION_SINGLELINE) == 0) {
2236 BUFPUSH(endbuf2);
2237 break;
2238 }
2239
2240 case 'z':
2241 BUFPUSH(endbuf);
2242 break;
2243
2244 case 'G':
2245 BUFPUSH(begpos);
2246 break;
2247
2248
2249 case 'x':
2250 had_mbchar = 0;
2251 c = scan_hex(p, 2, &numlen);
2252 if (numlen == 0) goto invalid_escape;
2253 p += numlen;
2254 had_num_literal = 1;
2255 goto numeric_char;
2256
2257
2258 case '0':
2259 had_mbchar = 0;
2260 c = scan_oct(p, 2, &numlen);
2261 p += numlen;
2262 had_num_literal = 1;
2263 goto numeric_char;
2264
2265
2266 case '1': case '2': case '3':
2267 case '4': case '5': case '6':
2268 case '7': case '8': case '9':
2269 PATUNFETCH;
2270 p0 = p;
2271
2272 had_mbchar = 0;
2273 c1 = 0;
2274 GET_UNSIGNED_NUMBER(c1);
2275 if (!ISDIGIT(c)) PATUNFETCH;
2276
2277 if (9 < c1 && c1 >= regnum) {
2278
2279 c = scan_oct(p0, 3, &numlen) & 0xff;
2280 p = p0 + numlen;
2281 c1 = 0;
2282 had_num_literal = 1;
2283 goto numeric_char;
2284 }
2285
2286 laststart = b;
2287 BUFPUSH(duplicate);
2288 BUFPUSH(c1);
2289 break;
2290
2291 case 'M':
2292 case 'C':
2293 case 'c':
2294 p0 = --p;
2295 c = read_special(p, pend, &p0);
2296 if (c > 255) goto invalid_escape;
2297 p = p0;
2298 had_num_literal = 1;
2299 goto numeric_char;
2300
2301 default:
2302 c = read_backslash(c);
2303 goto normal_char;
2304 }
2305 break;
2306
2307 case '#':
2308 if (options & RE_OPTION_EXTENDED) {
2309 while (p != pend) {
2310 PATFETCH(c);
2311 if (c == '\n') break;
2312 }
2313 break;
2314 }
2315 goto normal_char;
2316
2317 case ' ':
2318 case '\t':
2319 case '\f':
2320 case '\r':
2321 case '\n':
2322 if (options & RE_OPTION_EXTENDED)
2323 break;
2324
2325 default:
2326 normal_char:
2327 had_mbchar = 0;
2328 if (ismbchar(c)) {
2329 had_mbchar = 1;
2330 c1 = p - pattern;
2331 }
2332 numeric_char:
2333 nextp = p + mbclen(c) - 1;
2334 if (!pending_exact || pending_exact + *pending_exact + 1 != b
2335 || *pending_exact >= (c1 ? 0176 : 0177)
2336 || *nextp == '+' || *nextp == '?'
2337 || *nextp == '*' || *nextp == '^'
2338 || *nextp == '{') {
2339 laststart = b;
2340 BUFPUSH(exactn);
2341 pending_exact = b;
2342 BUFPUSH(0);
2343 }
2344 if (had_num_literal || c == 0xff) {
2345 BUFPUSH(0xff);
2346 (*pending_exact)++;
2347 had_num_literal = 0;
2348 }
2349 BUFPUSH(c);
2350 (*pending_exact)++;
2351 if (had_mbchar) {
2352 int len = mbclen(c) - 1;
2353 while (len--) {
2354 PATFETCH_RAW(c);
2355 BUFPUSH(c);
2356 (*pending_exact)++;
2357 }
2358 }
2359 }
2360 }
2361
2362 if (fixup_alt_jump)
2363 store_jump(fixup_alt_jump, jump, b);
2364
2365 if (stackp != stackb)
2366 FREE_AND_RETURN(stackb, "unmatched (");
2367
2368
2369 laststart = bufp->buffer;
2370 if (laststart != b) {
2371 if (*laststart == start_memory) laststart += 3;
2372 if (*laststart == dummy_failure_jump) laststart += 3;
2373 else if (*laststart == try_next) laststart += 3;
2374 if (*laststart == anychar_repeat) {
2375 bufp->options |= RE_OPTIMIZE_ANCHOR;
2376 }
2377 }
2378
2379 bufp->used = b - bufp->buffer;
2380 bufp->re_nsub = regnum;
2381 laststart = bufp->buffer;
2382 if (laststart != b) {
2383 if (*laststart == start_memory) laststart += 3;
2384 if (*laststart == exactn) {
2385 bufp->options |= RE_OPTIMIZE_EXACTN;
2386 bufp->must = laststart+1;
2387 }
2388 }
2389 if (!bufp->must) {
2390 bufp->must = calculate_must_string(bufp->buffer, b);
2391 }
2392 if (current_mbctype == MBCTYPE_SJIS) bufp->options |= RE_OPTIMIZE_NO_BM;
2393 else if (bufp->must) {
2394 int i;
2395 int len = (unsigned char)bufp->must[0];
2396
2397 for (i=1; i<len; i++) {
2398 if ((unsigned char)bufp->must[i] == 0xff ||
2399 (current_mbctype && ismbchar(bufp->must[i]))) {
2400 bufp->options |= RE_OPTIMIZE_NO_BM;
2401 break;
2402 }
2403 }
2404 if (!(bufp->options & RE_OPTIMIZE_NO_BM)) {
2405 bufp->must_skip = (int *) xmalloc((1 << BYTEWIDTH)*sizeof(int));
2406 bm_init_skip(bufp->must_skip, (unsigned char*)bufp->must+1,
2407 (unsigned char)bufp->must[0],
2408 (unsigned char*)(MAY_TRANSLATE()?translate:0));
2409 }
2410 }
2411
2412 bufp->regstart = TMALLOC(regnum, unsigned char*);
2413 bufp->regend = TMALLOC(regnum, unsigned char*);
2414 bufp->old_regstart = TMALLOC(regnum, unsigned char*);
2415 bufp->old_regend = TMALLOC(regnum, unsigned char*);
2416 bufp->reg_info = TMALLOC(regnum, register_info_type);
2417 bufp->best_regstart = TMALLOC(regnum, unsigned char*);
2418 bufp->best_regend = TMALLOC(regnum, unsigned char*);
2419 FREE_AND_RETURN(stackb, 0);
2420
2421 invalid_pattern:
2422 FREE_AND_RETURN(stackb, "invalid regular expression");
2423
2424 end_of_pattern:
2425 FREE_AND_RETURN(stackb, "premature end of regular expression");
2426
2427 too_big:
2428 FREE_AND_RETURN(stackb, "regular expression too big");
2429
2430 memory_exhausted:
2431 FREE_AND_RETURN(stackb, "memory exhausted");
2432
2433 nested_meta:
2434 FREE_AND_RETURN(stackb, "nested *?+ in regexp");
2435
2436 invalid_escape:
2437 FREE_AND_RETURN(stackb, "Invalid escape character syntax");
2438 }
2439
2440 void
2441 re_free_pattern(bufp)
2442 struct re_pattern_buffer *bufp;
2443 {
2444 xfree(bufp->buffer);
2445 xfree(bufp->fastmap);
2446 if (bufp->must_skip) xfree(bufp->must_skip);
2447
2448 xfree(bufp->regstart);
2449 xfree(bufp->regend);
2450 xfree(bufp->old_regstart);
2451 xfree(bufp->old_regend);
2452 xfree(bufp->best_regstart);
2453 xfree(bufp->best_regend);
2454 xfree(bufp->reg_info);
2455 xfree(bufp);
2456 }
2457
2458
2459
2460
2461
2462 static void
2463 store_jump(from, opcode, to)
2464 char *from, *to;
2465 int opcode;
2466 {
2467 from[0] = (char)opcode;
2468 STORE_NUMBER(from + 1, to - (from + 3));
2469 }
2470
2471
2472
2473
2474
2475
2476
2477
2478 static void
2479 insert_jump(op, from, to, current_end)
2480 int op;
2481 char *from, *to, *current_end;
2482 {
2483 register char *pfrom = current_end;
2484 register char *pto = current_end + 3;
2485
2486 while (pfrom != from)
2487 *--pto = *--pfrom;
2488 store_jump(from, op, to);
2489 }
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500 static void
2501 store_jump_n(from, opcode, to, n)
2502 char *from, *to;
2503 int opcode;
2504 unsigned n;
2505 {
2506 from[0] = (char)opcode;
2507 STORE_NUMBER(from + 1, to - (from + 3));
2508 STORE_NUMBER(from + 3, n);
2509 }
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520 static void
2521 insert_jump_n(op, from, to, current_end, n)
2522 int op;
2523 char *from, *to, *current_end;
2524 unsigned n;
2525 {
2526 register char *pfrom = current_end;
2527 register char *pto = current_end + 5;
2528
2529 while (pfrom != from)
2530 *--pto = *--pfrom;
2531 store_jump_n(from, op, to, n);
2532 }
2533
2534
2535
2536
2537
2538
2539
2540
2541 static void
2542 insert_op(op, there, current_end)
2543 int op;
2544 char *there, *current_end;
2545 {
2546 register char *pfrom = current_end;
2547 register char *pto = current_end + 1;
2548
2549 while (pfrom != there)
2550 *--pto = *--pfrom;
2551
2552 there[0] = (char)op;
2553 }
2554
2555
2556
2557
2558
2559
2560
2561
2562 static void
2563 insert_op_2(op, there, current_end, num_1, num_2)
2564 int op;
2565 char *there, *current_end;
2566 int num_1, num_2;
2567 {
2568 register char *pfrom = current_end;
2569 register char *pto = current_end + 5;
2570
2571 while (pfrom != there)
2572 *--pto = *--pfrom;
2573
2574 there[0] = (char)op;
2575 STORE_NUMBER(there + 1, num_1);
2576 STORE_NUMBER(there + 3, num_2);
2577 }
2578
2579
2580 #define trans_eq(c1, c2, translate) (translate?(translate[c1]==translate[c2]):((c1)==(c2)))
2581 static int
2582 slow_match(little, lend, big, bend, translate)
2583 unsigned char *little, *lend;
2584 unsigned char *big, *bend;
2585 unsigned char *translate;
2586 {
2587 int c;
2588
2589 while (little < lend && big < bend) {
2590 c = *little++;
2591 if (c == 0xff)
2592 c = *little++;
2593 if (!trans_eq(*big++, c, translate)) break;
2594 }
2595 if (little == lend) return 1;
2596 return 0;
2597 }
2598
2599 static int
2600 slow_search(little, llen, big, blen, translate)
2601 unsigned char *little;
2602 int llen;
2603 unsigned char *big;
2604 int blen;
2605 char *translate;
2606 {
2607 unsigned char *bsave = big;
2608 unsigned char *bend = big + blen;
2609 register int c;
2610 int fescape = 0;
2611
2612 c = *little;
2613 if (c == 0xff) {
2614 c = little[1];
2615 fescape = 1;
2616 }
2617 else if (translate && !ismbchar(c)) {
2618 c = translate[c];
2619 }
2620
2621 while (big < bend) {
2622
2623 if (fescape) {
2624 while (big < bend) {
2625 if (*big == c) break;
2626 big++;
2627 }
2628 }
2629 else if (translate && !ismbchar(c)) {
2630 while (big < bend) {
2631 if (ismbchar(*big)) big+=mbclen(*big)-1;
2632 else if (translate[*big] == c) break;
2633 big++;
2634 }
2635 }
2636 else {
2637 while (big < bend) {
2638 if (*big == c) break;
2639 if (ismbchar(*big)) big+=mbclen(*big)-1;
2640 big++;
2641 }
2642 }
2643
2644 if (slow_match(little, little+llen, big, bend, translate))
2645 return big - bsave;
2646
2647 big+=mbclen(*big);
2648 }
2649 return -1;
2650 }
2651
2652 static void
2653 bm_init_skip(skip, pat, m, translate)
2654 int *skip;
2655 unsigned char *pat;
2656 int m;
2657 const unsigned char *translate;
2658 {
2659 int j, c;
2660
2661 for (c=0; c<256; c++) {
2662 skip[c] = m;
2663 }
2664 if (translate) {
2665 for (j=0; j<m-1; j++) {
2666 skip[translate[pat[j]]] = m-1-j;
2667 }
2668 }
2669 else {
2670 for (j=0; j<m-1; j++) {
2671 skip[pat[j]] = m-1-j;
2672 }
2673 }
2674 }
2675
2676 static int
2677 bm_search(little, llen, big, blen, skip, translate)
2678 unsigned char *little;
2679 int llen;
2680 unsigned char *big;
2681 int blen;
2682 int *skip;
2683 unsigned char *translate;
2684 {
2685 int i, j, k;
2686
2687 i = llen-1;
2688 if (translate) {
2689 while (i < blen) {
2690 k = i;
2691 j = llen-1;
2692 while (j >= 0 && translate[big[k]] == translate[little[j]]) {
2693 k--;
2694 j--;
2695 }
2696 if (j < 0) return k+1;
2697
2698 i += skip[translate[big[i]]];
2699 }
2700 return -1;
2701 }
2702 while (i < blen) {
2703 k = i;
2704 j = llen-1;
2705 while (j >= 0 && big[k] == little[j]) {
2706 k--;
2707 j--;
2708 }
2709 if (j < 0) return k+1;
2710
2711 i += skip[big[i]];
2712 }
2713 return -1;
2714 }
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724 void
2725 re_compile_fastmap(bufp)
2726 struct re_pattern_buffer *bufp;
2727 {
2728 unsigned char *pattern = (unsigned char*)bufp->buffer;
2729 int size = bufp->used;
2730 register char *fastmap = bufp->fastmap;
2731 register unsigned char *p = pattern;
2732 register unsigned char *pend = pattern + size;
2733 register int j, k;
2734 unsigned is_a_succeed_n;
2735
2736
2737 unsigned char *stacka[NFAILURES];
2738 unsigned char **stackb = stacka;
2739 unsigned char **stackp = stackb;
2740 unsigned char **stacke = stackb + NFAILURES;
2741 int options = bufp->options;
2742
2743 memset(fastmap, 0, (1 << BYTEWIDTH));
2744 bufp->fastmap_accurate = 1;
2745 bufp->can_be_null = 0;
2746
2747 while (p) {
2748 is_a_succeed_n = 0;
2749 if (p == pend) {
2750 bufp->can_be_null = 1;
2751 break;
2752 }
2753 #ifdef SWITCH_ENUM_BUG
2754 switch ((int)((enum regexpcode)*p++))
2755 #else
2756 switch ((enum regexpcode)*p++)
2757 #endif
2758 {
2759 case exactn:
2760 if (p[1] == 0xff) {
2761 if (TRANSLATE_P())
2762 fastmap[translate[p[2]]] = 2;
2763 else
2764 fastmap[p[2]] = 2;
2765 bufp->options |= RE_OPTIMIZE_BMATCH;
2766 }
2767 else if (TRANSLATE_P())
2768 fastmap[translate[p[1]]] = 1;
2769 else
2770 fastmap[p[1]] = 1;
2771 break;
2772
2773 case begline:
2774 case begbuf:
2775 case begpos:
2776 case endbuf:
2777 case endbuf2:
2778 case wordbound:
2779 case notwordbound:
2780 case wordbeg:
2781 case wordend:
2782 case pop_and_fail:
2783 case push_dummy_failure:
2784 case start_paren:
2785 case stop_paren:
2786 continue;
2787
2788 case casefold_on:
2789 bufp->options |= RE_MAY_IGNORECASE;
2790 case casefold_off:
2791 options ^= RE_OPTION_IGNORECASE;
2792 continue;
2793
2794 case option_set:
2795 options = *p++;
2796 continue;
2797
2798 case endline:
2799 if (TRANSLATE_P())
2800 fastmap[translate['\n']] = 1;
2801 else
2802 fastmap['\n'] = 1;
2803 if ((options & RE_OPTION_SINGLELINE) == 0 && bufp->can_be_null == 0)
2804 bufp->can_be_null = 2;
2805 break;
2806
2807 case jump_n:
2808 case finalize_jump:
2809 case maybe_finalize_jump:
2810 case jump:
2811 case jump_past_alt:
2812 case dummy_failure_jump:
2813 case finalize_push:
2814 case finalize_push_n:
2815 EXTRACT_NUMBER_AND_INCR(j, p);
2816 p += j;
2817 if (j > 0)
2818 continue;
2819
2820
2821
2822
2823
2824
2825
2826 if ((enum regexpcode)*p != on_failure_jump
2827 && (enum regexpcode)*p != try_next
2828 && (enum regexpcode)*p != succeed_n)
2829 continue;
2830 p++;
2831 EXTRACT_NUMBER_AND_INCR(j, p);
2832 p += j;
2833 if (stackp != stackb && *stackp == p)
2834 stackp--;
2835 continue;
2836
2837 case try_next:
2838 case start_nowidth:
2839 case stop_nowidth:
2840 case stop_backtrack:
2841 p += 2;
2842 continue;
2843
2844 case succeed_n:
2845 is_a_succeed_n = 1;
2846
2847 EXTRACT_NUMBER(k, p + 2);
2848
2849 if (k != 0) {
2850 p += 4;
2851 continue;
2852 }
2853
2854
2855 case on_failure_jump:
2856 EXTRACT_NUMBER_AND_INCR(j, p);
2857 if (p + j < pend) {
2858 if (stackp == stacke) {
2859 EXPAND_FAIL_STACK();
2860 }
2861 *++stackp = p + j;
2862 }
2863 else {
2864 bufp->can_be_null = 1;
2865 }
2866 if (is_a_succeed_n)
2867 EXTRACT_NUMBER_AND_INCR(k, p);
2868 continue;
2869
2870 case set_number_at:
2871 p += 4;
2872 continue;
2873
2874 case start_memory:
2875 case stop_memory:
2876 p += 2;
2877 continue;
2878
2879 case duplicate:
2880 bufp->can_be_null = 1;
2881 if (*p >= bufp->re_nsub) break;
2882 fastmap['\n'] = 1;
2883 case anychar_repeat:
2884 case anychar:
2885 for (j = 0; j < (1 << BYTEWIDTH); j++) {
2886 if (j != '\n' || (options & RE_OPTION_MULTILINE))
2887 fastmap[j] = 1;
2888 }
2889 if (bufp->can_be_null) {
2890 FREE_AND_RETURN_VOID(stackb);
2891 }
2892
2893
2894 if ((enum regexpcode)p[-1] == anychar_repeat) {
2895 continue;
2896 }
2897 break;
2898
2899 case wordchar:
2900 for (j = 0; j < 0x80; j++) {
2901 if (SYNTAX(j) == Sword)
2902 fastmap[j] = 1;
2903 }
2904 switch (current_mbctype) {
2905 case MBCTYPE_ASCII:
2906 for (j = 0x80; j < (1 << BYTEWIDTH); j++) {
2907 if (SYNTAX(j) == Sword2)
2908 fastmap[j] = 1;
2909 }
2910 break;
2911 case MBCTYPE_EUC:
2912 case MBCTYPE_SJIS:
2913 case MBCTYPE_UTF8:
2914 for (j = 0x80; j < (1 << BYTEWIDTH); j++) {
2915 if (re_mbctab[j])
2916 fastmap[j] = 1;
2917 }
2918 break;
2919 }
2920 break;
2921
2922 case notwordchar:
2923 for (j = 0; j < 0x80; j++)
2924 if (SYNTAX(j) != Sword)
2925 fastmap[j] = 1;
2926 switch (current_mbctype) {
2927 case MBCTYPE_ASCII:
2928 for (j = 0x80; j < (1 << BYTEWIDTH); j++) {
2929 if (SYNTAX(j) != Sword2)
2930 fastmap[j] = 1;
2931 }
2932 break;
2933 case MBCTYPE_EUC:
2934 case MBCTYPE_SJIS:
2935 case MBCTYPE_UTF8:
2936 for (j = 0x80; j < (1 << BYTEWIDTH); j++) {
2937 if (!re_mbctab[j])
2938 fastmap[j] = 1;
2939 }
2940 break;
2941 }
2942 break;
2943
2944 case charset:
2945
2946
2947 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2948 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) {
2949 int tmp = TRANSLATE_P()?translate[j]:j;
2950 fastmap[tmp] = 1;
2951 }
2952 {
2953 unsigned short size;
2954 unsigned long c, beg, end;
2955
2956 p += p[-1] + 2;
2957 size = EXTRACT_UNSIGNED(&p[-2]);
2958 for (j = 0; j < (int)size; j++) {
2959 c = EXTRACT_MBC(&p[j*8]);
2960 beg = WC2MBC1ST(c);
2961 c = EXTRACT_MBC(&p[j*8+4]);
2962 end = WC2MBC1ST(c);
2963
2964 while (beg <= end) {
2965
2966
2967 if (c < 0x100) {
2968 fastmap[beg] = 2;
2969 bufp->options |= RE_OPTIMIZE_BMATCH;
2970 }
2971 else if (ismbchar(beg))
2972 fastmap[beg] = 1;
2973 beg++;
2974 }
2975 }
2976 }
2977 break;
2978
2979 case charset_not:
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2992 if (!ismbchar(j))
2993 fastmap[j] = 1;
2994
2995 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2996 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) {
2997 if (!ismbchar(j))
2998 fastmap[j] = 1;
2999 }
3000 {
3001 unsigned short size;
3002 unsigned long c, beg;
3003 int num_literal = 0;
3004
3005 p += p[-1] + 2;
3006 size = EXTRACT_UNSIGNED(&p[-2]);
3007 if (size == 0) {
3008 for (j = 0x80; j < (1 << BYTEWIDTH); j++)
3009 if (ismbchar(j))
3010 fastmap[j] = 1;
3011 break;
3012 }
3013 for (j = 0,c = 0;j < (int)size; j++) {
3014 unsigned int cc = EXTRACT_MBC(&p[j*8]);
3015 beg = WC2MBC1ST(cc);
3016 while (c <= beg) {
3017 if (ismbchar(c))
3018 fastmap[c] = 1;
3019 c++;
3020 }
3021
3022 cc = EXTRACT_MBC(&p[j*8+4]);
3023 if (cc < 0xff) {
3024 num_literal = 1;
3025 while (c <= cc) {
3026 if (ismbchar(c))
3027 fastmap[c] = 1;
3028 c++;
3029 }
3030 }
3031 c = WC2MBC1ST(cc);
3032 }
3033
3034 for (j = c; j < (1 << BYTEWIDTH); j++) {
3035 if (num_literal)
3036 fastmap[j] = 1;
3037 if (ismbchar(j))
3038 fastmap[j] = 1;
3039 }
3040 }
3041 break;
3042
3043 case unused:
3044 break;
3045 }
3046
3047
3048
3049
3050
3051 if (stackp != stackb)
3052 p = *stackp--;
3053 else
3054 break;
3055 }
3056 FREE_AND_RETURN_VOID(stackb);
3057 }
3058
3059
3060 int
3061 re_adjust_startpos(bufp, string, size, startpos, range)
3062 struct re_pattern_buffer *bufp;
3063 const char *string;
3064 int size, startpos, range;
3065 {
3066
3067 if (!bufp->fastmap_accurate) {
3068 re_compile_fastmap(bufp);
3069 }
3070
3071
3072 if (current_mbctype && startpos>0 && !(bufp->options&RE_OPTIMIZE_BMATCH)) {
3073 int i = mbc_startpos(string, startpos);
3074
3075 if (i < startpos) {
3076 if (range > 0) {
3077 startpos = i + mbclen(string[i]);
3078 }
3079 else {
3080 int len = mbclen(string[i]);
3081 if (i + len <= startpos)
3082 startpos = i + len;
3083 else
3084 startpos = i;
3085 }
3086 }
3087 }
3088 return startpos;
3089 }
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104 int
3105 re_search(bufp, string, size, startpos, range, regs)
3106 struct re_pattern_buffer *bufp;
3107 const char *string;
3108 int size, startpos, range;
3109 struct re_registers *regs;
3110 {
3111 register char *fastmap = bufp->fastmap;
3112 int val, anchor = 0;
3113
3114
3115 if (startpos < 0 || startpos > size)
3116 return -1;
3117
3118
3119 if (fastmap && !bufp->fastmap_accurate) {
3120 re_compile_fastmap(bufp);
3121 }
3122
3123
3124
3125
3126 if (bufp->used > 0) {
3127 switch ((enum regexpcode)bufp->buffer[0]) {
3128 case begbuf:
3129 begbuf_match:
3130 if (range > 0) {
3131 if (startpos > 0) return -1;
3132 else {
3133 val = re_match(bufp, string, size, 0, regs);
3134 if (val >= 0) return 0;
3135 return val;
3136 }
3137 }
3138 break;
3139
3140 case begline:
3141 anchor = 1;
3142 break;
3143
3144 case begpos:
3145 val = re_match(bufp, string, size, startpos, regs);
3146 if (val >= 0) return startpos;
3147 return val;
3148
3149 default:
3150 break;
3151 }
3152 }
3153 if (bufp->options & RE_OPTIMIZE_ANCHOR) {
3154 if (bufp->options&RE_OPTION_SINGLELINE) {
3155 goto begbuf_match;
3156 }
3157 anchor = 1;
3158 }
3159
3160 if (bufp->must) {
3161 int len = ((unsigned char*)bufp->must)[0];
3162 int pos, pbeg, pend;
3163
3164 pbeg = startpos;
3165 pend = startpos + range;
3166 if (pbeg > pend) {
3167 pos = pend; pend = pbeg; pbeg = pos;
3168 }
3169 pend = size;
3170 if (bufp->options & RE_OPTIMIZE_NO_BM) {
3171 pos = slow_search(bufp->must+1, len,
3172 string+pbeg, pend-pbeg,
3173 MAY_TRANSLATE()?translate:0);
3174 }
3175 else {
3176 pos = bm_search(bufp->must+1, len,
3177 string+pbeg, pend-pbeg,
3178 bufp->must_skip,
3179 MAY_TRANSLATE()?translate:0);
3180 }
3181 if (pos == -1) return -1;
3182 if (range > 0 && (bufp->options & RE_OPTIMIZE_EXACTN)) {
3183 startpos += pos;
3184 range -= pos;
3185 if (range < 0) return -1;
3186 }
3187 }
3188
3189 for (;;) {
3190
3191
3192
3193
3194
3195
3196 if (fastmap && startpos < size
3197 && bufp->can_be_null != 1 && !(anchor && startpos == 0)) {
3198 if (range > 0) {
3199 register unsigned char *p, c;
3200 int irange = range;
3201
3202 p = (unsigned char*)string+startpos;
3203
3204 while (range > 0) {
3205 c = *p++;
3206 if (ismbchar(c)) {
3207 int len;
3208
3209 if (fastmap[c])
3210 break;
3211 len = mbclen(c) - 1;
3212 while (len--) {
3213 c = *p++;
3214 range--;
3215 if (fastmap[c] == 2)
3216 goto startpos_adjust;
3217 }
3218 }
3219 else {
3220 if (fastmap[MAY_TRANSLATE() ? translate[c] : c])
3221 break;
3222 }
3223 range--;
3224 }
3225 startpos_adjust:
3226 startpos += irange - range;
3227 }
3228 else {
3229 register unsigned char c;
3230
3231 c = string[startpos];
3232 c &= 0xff;
3233 if (MAY_TRANSLATE() ? !fastmap[translate[c]] : !fastmap[c])
3234 goto advance;
3235 }
3236 }
3237
3238 if (startpos > size) return -1;
3239 if ((anchor || !bufp->can_be_null) && range > 0 && size > 0 && startpos == size)
3240 return -1;
3241 val = re_match(bufp, string, size, startpos, regs);
3242 if (val >= 0) return startpos;
3243 if (val == -2) return -2;
3244
3245 #ifndef NO_ALLOCA
3246 #ifdef C_ALLOCA
3247 alloca(0);
3248 #endif
3249 #endif
3250
3251 if (range > 0) {
3252 if (anchor && startpos < size &&
3253 (startpos < 1 || string[startpos-1] != '\n')) {
3254 while (range > 0 && string[startpos] != '\n') {
3255 range--;
3256 startpos++;
3257 }
3258 }
3259 }
3260
3261 advance:
3262 if (!range)
3263 break;
3264 else if (range > 0) {
3265 const char *d = string + startpos;
3266
3267 if (ismbchar(*d)) {
3268 int len = mbclen(*d) - 1;
3269 range-=len, startpos+=len;
3270 if (!range)
3271 break;
3272 }
3273 range--, startpos++;
3274 }
3275 else {
3276 range++, startpos--;
3277 {
3278 const char *s, *d, *p;
3279
3280 s = string; d = string + startpos;
3281 for (p = d; p-- > s && ismbchar(*p); )
3282
3283
3284 ;
3285 if (!((d - p) & 1)) {
3286 if (!range)
3287 break;
3288 range++, startpos--;
3289 }
3290 }
3291 }
3292 }
3293 return -1;
3294 }
3295
3296
3297
3298
3299
3300
3301
3302
3303 #define IS_ACTIVE(R) ((R).bits.is_active)
3304 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3305
3306
3307
3308
3309
3310 #define NUM_REG_ITEMS 3
3311
3312
3313 #define NUM_COUNT_ITEMS 2
3314
3315
3316 #define NUM_NONREG_ITEMS 4
3317
3318
3319
3320
3321 #define MAX_NUM_FAILURE_ITEMS (num_regs * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
3322
3323
3324 #define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + NUM_NONREG_ITEMS + 1)
3325
3326
3327 #define PUSH_FAILURE_COUNT(ptr) \
3328 do { \
3329 int c; \
3330 EXTRACT_NUMBER(c, ptr); \
3331 ENSURE_FAIL_STACK(NUM_COUNT_ITEMS); \
3332 *stackp++ = (unsigned char*)(long)c; \
3333 *stackp++ = (ptr); \
3334 num_failure_counts++; \
3335 } while (0)
3336
3337
3338
3339
3340 #define PUSH_FAILURE_POINT(pattern_place, string_place) \
3341 do { \
3342 long last_used_reg, this_reg; \
3343 \
3344
3345 \
3346 for (last_used_reg = num_regs-1; last_used_reg > 0; last_used_reg--)\
3347 if (!REG_UNSET(regstart[last_used_reg])) \
3348 break; \
3349 \
3350 ENSURE_FAIL_STACK(NUM_FAILURE_ITEMS); \
3351 *stackp++ = (unsigned char*)(long)num_failure_counts; \
3352 num_failure_counts = 0; \
3353 \
3354 \
3355 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) { \
3356 *stackp++ = regstart[this_reg]; \
3357 *stackp++ = regend[this_reg]; \
3358 *stackp++ = reg_info[this_reg].word; \
3359 } \
3360 \
3361 \
3362 *stackp++ = (unsigned char*)last_used_reg; \
3363 \
3364 *stackp++ = pattern_place; \
3365 *stackp++ = string_place; \
3366 *stackp++ = (unsigned char*)(long)options; \
3367 *stackp++ = (unsigned char*)0; \
3368 } while(0)
3369
3370 #define NON_GREEDY ((unsigned char*)1)
3371
3372 #define POP_FAILURE_COUNT() \
3373 do { \
3374 unsigned char *ptr = *--stackp; \
3375 int count = (long)*--stackp; \
3376 STORE_NUMBER(ptr, count); \
3377 } while (0)
3378
3379
3380
3381 #define POP_FAILURE_POINT() \
3382 do { \
3383 long temp; \
3384 stackp -= NUM_NONREG_ITEMS; \
3385 temp = (long)*--stackp; \
3386 temp *= NUM_REG_ITEMS; \
3387 stackp -= temp; \
3388 temp = (long)*--stackp; \
3389 while (temp--) { \
3390 POP_FAILURE_COUNT(); \
3391 } \
3392 num_failure_counts = 0; \
3393 } while(0)
3394
3395
3396 #define REG_UNSET_VALUE ((unsigned char*)-1)
3397 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3398
3399 #define PREFETCH if (d == dend) goto fail
3400
3401
3402
3403
3404 #define SET_REGS_MATCHED \
3405 do { unsigned this_reg; \
3406 for (this_reg = 0; this_reg < num_regs; this_reg++) { \
3407 if (IS_ACTIVE(reg_info[this_reg])) \
3408 MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
3409 else \
3410 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
3411 } \
3412 } while(0)
3413
3414 #define AT_STRINGS_BEG(d) ((d) == string)
3415 #define AT_STRINGS_END(d) ((d) == dend)
3416
3417 #define IS_A_LETTER(d) (SYNTAX(*(d)) == Sword || \
3418 (current_mbctype ? \
3419 (re_mbctab[*(d)] && ((d)+mbclen(*(d)))<=dend): \
3420 SYNTAX(*(d)) == Sword2))
3421
3422 #define PREV_IS_A_LETTER(d) ((current_mbctype == MBCTYPE_SJIS)? \
3423 IS_A_LETTER((d)-(!AT_STRINGS_BEG((d)-1)&& \
3424 ismbchar((d)[-2])?2:1)): \
3425 ((current_mbctype && ((d)[-1] >= 0x80)) || \
3426 IS_A_LETTER((d)-1)))
3427
3428 static void
3429 init_regs(regs, num_regs)
3430 struct re_registers *regs;
3431 unsigned int num_regs;
3432 {
3433 int i;
3434
3435 regs->num_regs = num_regs;
3436 if (num_regs < RE_NREGS)
3437 num_regs = RE_NREGS;
3438
3439 if (regs->allocated == 0) {
3440 regs->beg = TMALLOC(num_regs, int);
3441 regs->end = TMALLOC(num_regs, int);
3442 regs->allocated = num_regs;
3443 }
3444 else if (regs->allocated < num_regs) {
3445 TREALLOC(regs->beg, num_regs, int);
3446 TREALLOC(regs->end, num_regs, int);
3447 regs->allocated = num_regs;
3448 }
3449 for (i=0; i<num_regs; i++) {
3450 regs->beg[i] = regs->end[i] = -1;
3451 }
3452 }
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469 int
3470 re_match(bufp, string_arg, size, pos, regs)
3471 struct re_pattern_buffer *bufp;
3472 const char *string_arg;
3473 int size, pos;
3474 struct re_registers *regs;
3475 {
3476 register unsigned char *p = (unsigned char*)bufp->buffer;
3477 unsigned char *p1;
3478
3479
3480 register unsigned char *pend = p + bufp->used;
3481
3482 unsigned num_regs = bufp->re_nsub;
3483
3484 unsigned char *string = (unsigned char*)string_arg;
3485
3486 register unsigned char *d, *dend;
3487 register int mcnt;
3488 int options = bufp->options;
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500 unsigned char **stacka;
3501 unsigned char **stackb;
3502 unsigned char **stackp;
3503 unsigned char **stacke;
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513 unsigned char **regstart = bufp->regstart;
3514 unsigned char **regend = bufp->regend;
3515
3516
3517
3518
3519
3520
3521 unsigned char **old_regstart = bufp->old_regstart;
3522 unsigned char **old_regend = bufp->old_regend;
3523
3524
3525
3526
3527
3528
3529
3530
3531 register_info_type *reg_info = bufp->reg_info;
3532
3533
3534
3535
3536
3537
3538 unsigned best_regs_set = 0;
3539 unsigned char **best_regstart = bufp->best_regstart;
3540 unsigned char **best_regend = bufp->best_regend;
3541
3542 int num_failure_counts = 0;
3543
3544 if (regs) {
3545 init_regs(regs, num_regs);
3546 }
3547
3548
3549 stacka = RE_TALLOC(MAX_NUM_FAILURE_ITEMS * NFAILURES, unsigned char*);
3550 stackb = stacka;
3551 stackp = stackb;
3552 stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
3553
3554 #ifdef DEBUG_REGEX
3555 fprintf(stderr, "Entering re_match(%s)\n", string_arg);
3556 #endif
3557
3558
3559
3560
3561
3562 for (mcnt = 0; mcnt < num_regs; mcnt++) {
3563 regstart[mcnt] = regend[mcnt]
3564 = old_regstart[mcnt] = old_regend[mcnt]
3565 = best_regstart[mcnt] = best_regend[mcnt] = REG_UNSET_VALUE;
3566 #ifdef __CHECKER__
3567 reg_info[mcnt].word = 0;
3568 #endif
3569 IS_ACTIVE (reg_info[mcnt]) = 0;
3570 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3571 }
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584 d = string + pos, dend = string + size;
3585
3586
3587
3588
3589
3590 for (;;) {
3591 #ifdef DEBUG_REGEX
3592 fprintf(stderr,
3593 "regex loop(%d): matching 0x%02d\n",
3594 p - (unsigned char*)bufp->buffer,
3595 *p);
3596 #endif
3597
3598 if (p == pend) {
3599
3600 if ((bufp->options & RE_OPTION_LONGEST) && d != dend) {
3601 if (best_regs_set)
3602 goto restore_best_regs;
3603 while (stackp != stackb && stackp[-1] == NON_GREEDY) {
3604 if (best_regs_set)
3605 goto restore_best_regs;
3606 POP_FAILURE_POINT();
3607 }
3608 if (stackp != stackb) {
3609
3610
3611
3612 if (! best_regs_set || (d > best_regend[0])) {
3613 best_regs_set = 1;
3614 best_regend[0] = d;
3615
3616 for (mcnt = 1; mcnt < num_regs; mcnt++) {
3617 best_regstart[mcnt] = regstart[mcnt];
3618 best_regend[mcnt] = regend[mcnt];
3619 }
3620 }
3621 goto fail;
3622 }
3623
3624 else if (best_regs_set) {
3625 restore_best_regs:
3626
3627 d = best_regend[0];
3628
3629 for (mcnt = 0; mcnt < num_regs; mcnt++) {
3630 regstart[mcnt] = best_regstart[mcnt];
3631 regend[mcnt] = best_regend[mcnt];
3632 }
3633 }
3634 }
3635
3636
3637
3638 if (regs) {
3639 regs->beg[0] = pos;
3640 regs->end[0] = d - string;
3641 for (mcnt = 1; mcnt < num_regs; mcnt++) {
3642 if (REG_UNSET(regend[mcnt])) {
3643 regs->beg[mcnt] = -1;
3644 regs->end[mcnt] = -1;
3645 continue;
3646 }
3647 regs->beg[mcnt] = regstart[mcnt] - string;
3648 regs->end[mcnt] = regend[mcnt] - string;
3649 }
3650 }
3651 FREE_AND_RETURN(stackb, (d - pos - string));
3652 }
3653
3654
3655 #ifdef SWITCH_ENUM_BUG
3656 switch ((int)((enum regexpcode)*p++))
3657 #else
3658 switch ((enum regexpcode)*p++)
3659 #endif
3660 {
3661
3662
3663
3664
3665 case start_memory:
3666 old_regstart[*p] = regstart[*p];
3667 regstart[*p] = d;
3668 IS_ACTIVE(reg_info[*p]) = 1;
3669 MATCHED_SOMETHING(reg_info[*p]) = 0;
3670 p += 2;
3671 continue;
3672
3673 case stop_memory:
3674 old_regend[*p] = regend[*p];
3675 regend[*p] = d;
3676 IS_ACTIVE(reg_info[*p]) = 0;
3677 p += 2;
3678 continue;
3679
3680 case start_paren:
3681 case stop_paren:
3682 break;
3683
3684
3685
3686 case duplicate:
3687 {
3688 int regno = *p++;
3689 register unsigned char *d2, *dend2;
3690
3691
3692 if (regno >= num_regs) goto fail;
3693
3694 if (IS_ACTIVE(reg_info[regno])) goto fail;
3695
3696
3697 d2 = regstart[regno];
3698 if (REG_UNSET(d2)) goto fail;
3699
3700
3701
3702
3703
3704
3705 dend2 = regend[regno];
3706 if (REG_UNSET(dend2)) goto fail;
3707 for (;;) {
3708
3709 if (d2 == dend2) break;
3710
3711
3712 PREFETCH;
3713
3714
3715 mcnt = dend - d;
3716
3717
3718
3719 if (mcnt > dend2 - d2)
3720 mcnt = dend2 - d2;
3721
3722
3723
3724 if ((options & RE_OPTION_IGNORECASE)
3725 ? memcmp_translate(d, d2, mcnt)
3726 : memcmp((char*)d, (char*)d2, mcnt))
3727 goto fail;
3728 d += mcnt, d2 += mcnt;
3729 }
3730 }
3731 break;
3732
3733 case start_nowidth:
3734 PUSH_FAILURE_POINT(0, d);
3735 if (stackp - stackb > RE_DUP_MAX) {
3736 FREE_AND_RETURN(stackb,(-2));
3737 }
3738 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3739 STORE_NUMBER(p+mcnt, stackp - stackb);
3740 continue;
3741
3742 case stop_nowidth:
3743 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3744 stackp = stackb + mcnt;
3745 d = stackp[-3];
3746 POP_FAILURE_POINT();
3747 continue;
3748
3749 case stop_backtrack:
3750 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3751 stackp = stackb + mcnt;
3752 POP_FAILURE_POINT();
3753 continue;
3754
3755 case pop_and_fail:
3756 EXTRACT_NUMBER(mcnt, p+1);
3757 stackp = stackb + mcnt;
3758 POP_FAILURE_POINT();
3759 goto fail;
3760
3761 case anychar:
3762 PREFETCH;
3763 if (ismbchar(*d)) {
3764 if (d + mbclen(*d) > dend)
3765 goto fail;
3766 SET_REGS_MATCHED;
3767 d += mbclen(*d);
3768 break;
3769 }
3770 if (!(options&RE_OPTION_MULTILINE)
3771 && (TRANSLATE_P() ? translate[*d] : *d) == '\n')
3772 goto fail;
3773 SET_REGS_MATCHED;
3774 d++;
3775 break;
3776
3777 case anychar_repeat:
3778 for (;;) {
3779 PUSH_FAILURE_POINT(p, d);
3780 PREFETCH;
3781 if (ismbchar(*d)) {
3782 if (d + mbclen(*d) > dend)
3783 goto fail;
3784 SET_REGS_MATCHED;
3785 d += mbclen(*d);
3786 continue;
3787 }
3788 if (!(options&RE_OPTION_MULTILINE) &&
3789 (TRANSLATE_P() ? translate[*d] : *d) == '\n')
3790 goto fail;
3791 SET_REGS_MATCHED;
3792 d++;
3793 }
3794 break;
3795
3796 case charset:
3797 case charset_not:
3798 {
3799 int not;
3800 int part = 0;
3801 unsigned char *dsave = d + 1;
3802 int cc, c;
3803
3804 PREFETCH;
3805 cc = c = (unsigned char)*d++;
3806 if (ismbchar(c)) {
3807 if (d + mbclen(c) - 1 <= dend) {
3808 MBC2WC(c, d);
3809 }
3810 }
3811 else if (TRANSLATE_P())
3812 cc = c = (unsigned char)translate[c];
3813
3814 not = is_in_list(c, p);
3815 if (!not && cc != c) {
3816 part = not = is_in_list(cc, p);
3817 }
3818 if (*(p - 1) == (unsigned char)charset_not) {
3819 not = !not;
3820 }
3821 if (!not) goto fail;
3822
3823 p += 1 + *p + 2 + EXTRACT_UNSIGNED(&p[1 + *p])*8;
3824 SET_REGS_MATCHED;
3825
3826 if (part) d = dsave;
3827 break;
3828 }
3829
3830 case begline:
3831 if (size == 0 || AT_STRINGS_BEG(d))
3832 break;
3833 if (d[-1] == '\n' && !AT_STRINGS_END(d))
3834 break;
3835 goto fail;
3836
3837 case endline:
3838 if (AT_STRINGS_END(d)) {
3839 if (size == 0 || d[-1] != '\n')
3840 break;
3841 }
3842 else if (*d == '\n')
3843 break;
3844 goto fail;
3845
3846
3847 case begbuf:
3848 if (AT_STRINGS_BEG(d))
3849 break;
3850 goto fail;
3851
3852
3853 case endbuf:
3854 if (AT_STRINGS_END(d))
3855 break;
3856 goto fail;
3857
3858
3859 case endbuf2:
3860 if (AT_STRINGS_END(d)) {
3861 if (size == 0 || d[-1] != '\n')
3862 break;
3863 }
3864
3865 if (*d == '\n' && AT_STRINGS_END(d+1))
3866 break;
3867 goto fail;
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886 case begpos:
3887 if (d - string == pos)
3888 break;
3889 goto fail;
3890
3891 case on_failure_jump:
3892 on_failure:
3893 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3894 PUSH_FAILURE_POINT(p + mcnt, d);
3895 continue;
3896
3897
3898
3899 case maybe_finalize_jump:
3900 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3901 p1 = p;
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917 while (p1 + 2 < pend) {
3918 if ((enum regexpcode)*p1 == stop_memory ||
3919 (enum regexpcode)*p1 == start_memory)
3920 p1 += 3;
3921 else if (
3922 (enum regexpcode)*p1 == stop_paren)
3923 p1 += 1;
3924 else
3925 break;
3926 }
3927
3928 if (p1 == pend)
3929 p[-3] = (unsigned char)finalize_jump;
3930 else if (*p1 == (unsigned char)exactn ||
3931 *p1 == (unsigned char)endline) {
3932 register int c = *p1 == (unsigned char)endline ? '\n' : p1[2];
3933 register unsigned char *p2 = p + mcnt;
3934
3935
3936 if (p2[3] == (unsigned char)exactn && p2[5] != c)
3937 p[-3] = (unsigned char)finalize_jump;
3938 else if (p2[3] == (unsigned char)charset ||
3939 p2[3] == (unsigned char)charset_not) {
3940 int not;
3941 if (ismbchar(c)) {
3942 unsigned char *pp = p1+3;
3943 MBC2WC(c, pp);
3944 }
3945
3946
3947 not = is_in_list(c, p2 + 4);
3948 if (p2[3] == (unsigned char)charset_not)
3949 not = !not;
3950 if (!not)
3951 p[-3] = (unsigned char)finalize_jump;
3952 }
3953 }
3954 p -= 2;
3955 if (p[-1] != (unsigned char)finalize_jump) {
3956 p[-1] = (unsigned char)jump;
3957 goto nofinalize;
3958 }
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968 case finalize_jump:
3969 if (stackp > stackb && stackp[-3] == d) {
3970 p = stackp[-4];
3971 POP_FAILURE_POINT();
3972 continue;
3973 }
3974 POP_FAILURE_POINT();
3975
3976
3977
3978
3979 case jump_past_alt:
3980
3981
3982
3983 case jump:
3984 nofinalize:
3985 EXTRACT_NUMBER_AND_INCR(mcnt, p);
3986 if (mcnt < 0 && stackp > stackb && stackp[-3] == d)
3987 goto fail;
3988 p += mcnt;
3989 continue;
3990
3991 case dummy_failure_jump:
3992
3993
3994
3995
3996
3997 PUSH_FAILURE_POINT(0, 0);
3998 goto nofinalize;
3999
4000
4001
4002
4003
4004
4005 case push_dummy_failure:
4006
4007
4008 p1 = p;
4009
4010 while (p1 + 2 < pend) {
4011 if ((enum regexpcode)*p1 == stop_memory ||
4012 (enum regexpcode)*p1 == start_memory)
4013 p1 += 3;
4014 else if (
4015 (enum regexpcode)*p1 == stop_paren)
4016 p1 += 1;
4017 else
4018 break;
4019 }
4020 if ((enum regexpcode)*p1 == jump)
4021 p[-1] = unused;
4022 else
4023 PUSH_FAILURE_POINT(0, 0);
4024 break;
4025
4026
4027
4028 case succeed_n:
4029 EXTRACT_NUMBER(mcnt, p + 2);
4030
4031 if (mcnt != 0) {
4032 mcnt--;
4033 p += 2;
4034 PUSH_FAILURE_COUNT(p);
4035 STORE_NUMBER_AND_INCR(p, mcnt);
4036 PUSH_FAILURE_POINT(0, 0);
4037 }
4038 else {
4039 goto on_failure;
4040 }
4041 continue;
4042
4043 case jump_n:
4044 EXTRACT_NUMBER(mcnt, p + 2);
4045
4046 if (mcnt) {
4047 mcnt--;
4048 PUSH_FAILURE_COUNT(p + 2);
4049 STORE_NUMBER(p + 2, mcnt);
4050 goto nofinalize;
4051
4052 }
4053
4054 else
4055 p += 4;
4056 continue;
4057
4058 case set_number_at:
4059 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4060 p1 = p + mcnt;
4061 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4062 STORE_NUMBER(p1, mcnt);
4063 continue;
4064
4065 case try_next:
4066 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4067 if (p + mcnt < pend) {
4068 PUSH_FAILURE_POINT(p, d);
4069 stackp[-1] = NON_GREEDY;
4070 }
4071 p += mcnt;
4072 continue;
4073
4074 case finalize_push:
4075 POP_FAILURE_POINT();
4076 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4077 if (mcnt < 0 && stackp > stackb && stackp[-3] == d)
4078 goto fail;
4079 PUSH_FAILURE_POINT(p + mcnt, d);
4080 stackp[-1] = NON_GREEDY;
4081 continue;
4082
4083 case finalize_push_n:
4084 EXTRACT_NUMBER(mcnt, p + 2);
4085
4086 if (mcnt) {
4087 int pos, i;
4088
4089 mcnt--;
4090 STORE_NUMBER(p + 2, mcnt);
4091 EXTRACT_NUMBER(pos, p);
4092 EXTRACT_NUMBER(i, p+pos+5);
4093 if (i > 0) goto nofinalize;
4094 POP_FAILURE_POINT();
4095 EXTRACT_NUMBER_AND_INCR(mcnt, p);
4096 PUSH_FAILURE_POINT(p + mcnt, d);
4097 stackp[-1] = NON_GREEDY;
4098 p += 2;
4099 }
4100
4101 else
4102 p += 4;
4103 continue;
4104
4105
4106
4107 case unused:
4108 continue;
4109
4110 case casefold_on:
4111 options |= RE_OPTION_IGNORECASE;
4112 continue;
4113
4114 case casefold_off:
4115 options &= ~RE_OPTION_IGNORECASE;
4116 continue;
4117
4118 case option_set:
4119 options = *p++;
4120 continue;
4121
4122 case wordbound:
4123 if (AT_STRINGS_BEG(d)) {
4124 if (IS_A_LETTER(d)) break;
4125 else goto fail;
4126 }
4127 if (AT_STRINGS_END(d)) {
4128 if (PREV_IS_A_LETTER(d)) break;
4129 else goto fail;
4130 }
4131 if (PREV_IS_A_LETTER(d) != IS_A_LETTER(d))
4132 break;
4133 goto fail;
4134
4135 case notwordbound:
4136 if (AT_STRINGS_BEG(d)) {
4137 if (IS_A_LETTER(d)) goto fail;
4138 else break;
4139 }
4140 if (AT_STRINGS_END(d)) {
4141 if (PREV_IS_A_LETTER(d)) goto fail;
4142 else break;
4143 }
4144 if (PREV_IS_A_LETTER(d) != IS_A_LETTER(d))
4145 goto fail;
4146 break;
4147
4148 case wordbeg:
4149 if (IS_A_LETTER(d) && (AT_STRINGS_BEG(d) || !PREV_IS_A_LETTER(d)))
4150 break;
4151 goto fail;
4152
4153 case wordend:
4154 if (!AT_STRINGS_BEG(d) && PREV_IS_A_LETTER(d)
4155 && (!IS_A_LETTER(d) || AT_STRINGS_END(d)))
4156 break;
4157 goto fail;
4158
4159 case wordchar:
4160 PREFETCH;
4161 if (!IS_A_LETTER(d))
4162 goto fail;
4163 if (ismbchar(*d) && d + mbclen(*d) - 1 < dend)
4164 d += mbclen(*d) - 1;
4165 d++;
4166 SET_REGS_MATCHED;
4167 break;
4168
4169 case notwordchar:
4170 PREFETCH;
4171 if (IS_A_LETTER(d))
4172 goto fail;
4173 if (ismbchar(*d) && d + mbclen(*d) - 1 < dend)
4174 d += mbclen(*d) - 1;
4175 d++;
4176 SET_REGS_MATCHED;
4177 break;
4178
4179 case exactn:
4180
4181
4182 mcnt = *p++;
4183
4184
4185 if (TRANSLATE_P()) {
4186 do {
4187 unsigned char c;
4188
4189 PREFETCH;
4190 if (*p == 0xff) {
4191 p++;
4192 if (!--mcnt
4193 || AT_STRINGS_END(d)
4194 || (unsigned char)*d++ != (unsigned char)*p++)
4195 goto fail;
4196 continue;
4197 }
4198 c = *d++;
4199 if (ismbchar(c)) {
4200 int n;
4201
4202 if (c != (unsigned char)*p++)
4203 goto fail;
4204 for (n = mbclen(c) - 1; n > 0; n--)
4205 if (!--mcnt
4206
4207 || AT_STRINGS_END(d)
4208 || (unsigned char)*d++ != (unsigned char)*p++)
4209 goto fail;
4210 continue;
4211 }
4212
4213 if ((unsigned char)translate[c] != (unsigned char)translate[*p++])
4214 goto fail;
4215 }
4216 while (--mcnt);
4217 }
4218 else {
4219 do {
4220 PREFETCH;
4221 if (*p == 0xff) {p++; mcnt--;}
4222 if (*d++ != *p++) goto fail;
4223 }
4224 while (--mcnt);
4225 }
4226 SET_REGS_MATCHED;
4227 break;
4228 }
4229 #ifdef RUBY
4230 CHECK_INTS;
4231 #endif
4232 continue;
4233
4234
4235 fail:
4236 if (stackp != stackb) {
4237
4238 short last_used_reg, this_reg;
4239
4240
4241
4242 if (stackp[-4] == 0 || (best_regs_set && stackp[-1] == NON_GREEDY)) {
4243 POP_FAILURE_POINT();
4244 goto fail;
4245 }
4246 stackp--;
4247 options = (long)*--stackp;
4248 d = *--stackp;
4249 p = *--stackp;
4250
4251 last_used_reg = (long)*--stackp;
4252
4253
4254 for (this_reg = num_regs - 1; this_reg > last_used_reg; this_reg--) {
4255 regend[this_reg] = REG_UNSET_VALUE;
4256 regstart[this_reg] = REG_UNSET_VALUE;
4257 IS_ACTIVE(reg_info[this_reg]) = 0;
4258 MATCHED_SOMETHING(reg_info[this_reg]) = 0;
4259 }
4260
4261
4262 for ( ; this_reg > 0; this_reg--) {
4263 reg_info[this_reg].word = *--stackp;
4264 regend[this_reg] = *--stackp;
4265 regstart[this_reg] = *--stackp;
4266 }
4267 mcnt = (long)*--stackp;
4268 while (mcnt--) {
4269 POP_FAILURE_COUNT();
4270 }
4271 if (p < pend) {
4272 int is_a_jump_n = 0;
4273 int failed_paren = 0;
4274
4275 p1 = p;
4276
4277
4278 switch ((enum regexpcode)*p1) {
4279 case jump_n:
4280 case finalize_push_n:
4281 is_a_jump_n = 1;
4282 case maybe_finalize_jump:
4283 case finalize_jump:
4284 case finalize_push:
4285 case jump:
4286 p1++;
4287 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
4288
4289 if (mcnt >= 0) break;
4290 p1 += mcnt;
4291
4292 if (( is_a_jump_n && (enum regexpcode)*p1 == succeed_n) ||
4293 (!is_a_jump_n && (enum regexpcode)*p1 == on_failure_jump)) {
4294 if (failed_paren) {
4295 p1++;
4296 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
4297 PUSH_FAILURE_POINT(p1 + mcnt, d);
4298 }
4299 goto fail;
4300 }
4301 break;
4302 default:
4303 ;
4304 }
4305 }
4306 }
4307 else
4308 break;
4309 }
4310
4311 if (best_regs_set)
4312 goto restore_best_regs;
4313
4314 FREE_AND_RETURN(stackb,(-1));
4315 }
4316
4317
4318 static int
4319 memcmp_translate(s1, s2, len)
4320 unsigned char *s1, *s2;
4321 register int len;
4322 {
4323 register unsigned char *p1 = s1, *p2 = s2, c;
4324 while (len) {
4325 c = *p1++;
4326 if (ismbchar(c)) {
4327 int n;
4328
4329 if (c != *p2++) return 1;
4330 for (n = mbclen(c) - 1; n > 0; n--)
4331 if (!--len || *p1++ != *p2++)
4332 return 1;
4333 }
4334 else
4335 if (translate[c] != translate[*p2++])
4336 return 1;
4337 len--;
4338 }
4339 return 0;
4340 }
4341
4342 void
4343 re_copy_registers(regs1, regs2)
4344 struct re_registers *regs1, *regs2;
4345 {
4346 int i;
4347
4348 if (regs1 == regs2) return;
4349 if (regs1->allocated == 0) {
4350 regs1->beg = TMALLOC(regs2->num_regs, int);
4351 regs1->end = TMALLOC(regs2->num_regs, int);
4352 regs1->allocated = regs2->num_regs;
4353 }
4354 else if (regs1->allocated < regs2->num_regs) {
4355 TREALLOC(regs1->beg, regs2->num_regs, int);
4356 TREALLOC(regs1->end, regs2->num_regs, int);
4357 regs1->allocated = regs2->num_regs;
4358 }
4359 for (i=0; i<regs2->num_regs; i++) {
4360 regs1->beg[i] = regs2->beg[i];
4361 regs1->end[i] = regs2->end[i];
4362 }
4363 regs1->num_regs = regs2->num_regs;
4364 }
4365
4366 void
4367 re_free_registers(regs)
4368 struct re_registers *regs;
4369 {
4370 if (regs->allocated == 0) return;
4371 if (regs->beg) xfree(regs->beg);
4372 if (regs->end) xfree(regs->end);
4373 }
4374
4375
4376
4377
4378 static const unsigned char mbctab_ascii[] = {
4379 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4380 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4381 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4382 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4383 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4384 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4385 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4386 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4387 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4388 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4389 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4390 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4391 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4392 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4393 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4394 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4395 };
4396
4397 static const unsigned char mbctab_euc[] = {
4398 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4399 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4400 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4401 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4402 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4403 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4404 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4405 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4406 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,
4407 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4408 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4409 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4410 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4411 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4412 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4413 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
4414 };
4415
4416 static const unsigned char mbctab_sjis[] = {
4417 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4418 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4419 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4420 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4422 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4423 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4424 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4425 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4426 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4427 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4429 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4431 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4432 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
4433 };
4434
4435 static const unsigned char mbctab_sjis_trail[] = {
4436 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4438 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4439 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4440 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4441 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4442 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4443 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
4444 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4445 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4446 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4447 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4448 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4449 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4450 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4451 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
4452 };
4453
4454 static const unsigned char mbctab_utf8[] = {
4455 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4456 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4457 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4458 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4459 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4460 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4461 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4462 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4463 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4464 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4465 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4466 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4467 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4468 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4469 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
4470 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0,
4471 };
4472
4473 const unsigned char *re_mbctab = mbctab_ascii;
4474
4475 void
4476 re_mbcinit(mbctype)
4477 int mbctype;
4478 {
4479 switch (mbctype) {
4480 case MBCTYPE_ASCII:
4481 re_mbctab = mbctab_ascii;
4482 current_mbctype = MBCTYPE_ASCII;
4483 break;
4484 case MBCTYPE_EUC:
4485 re_mbctab = mbctab_euc;
4486 current_mbctype = MBCTYPE_EUC;
4487 break;
4488 case MBCTYPE_SJIS:
4489 re_mbctab = mbctab_sjis;
4490 current_mbctype = MBCTYPE_SJIS;
4491 break;
4492 case MBCTYPE_UTF8:
4493 re_mbctab = mbctab_utf8;
4494 current_mbctype = MBCTYPE_UTF8;
4495 break;
4496 }
4497 }
4498
4499 #define mbc_isfirst(t, c) (t)[(unsigned char)(c)]
4500 #define mbc_len(t, c) ((t)[(unsigned char)(c)]+1)
4501
4502 static unsigned int
4503 asc_startpos(string, pos)
4504 const char *string;
4505 unsigned int pos;
4506 {
4507 return pos;
4508 }
4509
4510 #define euc_islead(c) ((unsigned char)((c) - 0xa1) > 0xfe - 0xa1)
4511 #define euc_mbclen(c) mbc_len(mbctab_euc, (c))
4512 static unsigned int
4513 euc_startpos(string, pos)
4514 const char *string;
4515 unsigned int pos;
4516 {
4517 unsigned int i = pos, w;
4518
4519 while (i > 0 && !euc_islead(string[i])) {
4520 --i;
4521 }
4522 if (i == pos || i + (w = euc_mbclen(string[i])) > pos) {
4523 return i;
4524 }
4525 i += w;
4526 return i + ((pos - i) & ~1);
4527 }
4528
4529 #define sjis_isfirst(c) mbc_isfirst(mbctab_sjis, (c))
4530 #define sjis_istrail(c) mbctab_sjis_trail[(unsigned char)(c)]
4531 #define sjis_mbclen(c) mbc_len(mbctab_sjis, (c))
4532 static unsigned int
4533 sjis_startpos(string, pos)
4534 const char *string;
4535 unsigned int pos;
4536 {
4537 unsigned int i = pos, w;
4538
4539 if (i > 0 && sjis_istrail(string[i])) {
4540 do {
4541 if (!sjis_isfirst(string[--i])) {
4542 ++i;
4543 break;
4544 }
4545 } while (i > 0);
4546 }
4547 if (i == pos || i + (w = sjis_mbclen(string[i])) > pos) {
4548 return i;
4549 }
4550 i += w;
4551 return i + ((pos - i) & ~1);
4552 }
4553
4554 #define utf8_islead(c) ((unsigned char)((c) & 0xc0) != 0x80)
4555 #define utf8_mbclen(c) mbc_len(mbctab_utf8, (c))
4556 static unsigned int
4557 utf8_startpos(string, pos)
4558 const char *string;
4559 unsigned int pos;
4560 {
4561 unsigned int i = pos, w;
4562
4563 while (i > 0 && !utf8_islead(string[i])) {
4564 --i;
4565 }
4566 if (i == pos || i + (w = utf8_mbclen(string[i])) > pos) {
4567 return i;
4568 }
4569 return i + w;
4570 }
4571
4572
4573
4574
4575
4576
4577
4578
4579