1 /*
2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3 * All rights reserved
4 *
5 * "THE BEER-WARE LICENSE" (Revision 42):
6 * Sergey Lyubka wrote this file. As long as you retain this notice you
7 * can do whatever you want with this stuff. If we meet some day, and you think
8 * this stuff is worth it, you can buy me a beer in return.
9 */
10
11 /*
12 * Downloaded Sat Nov 5 17:43:06 CET 2011 at
13 * http://slre.sourceforge.net/1.0/slre.c
14 */
15
16 #ifdef SLRE_TEST
17 #include <stdio.h>
18 #include <assert.h>
19 #include <ctype.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #else
23 #include <log.h>
24 #include <common.h>
25 #include <linux/ctype.h>
26 #endif /* SLRE_TEST */
27
28 #include <errno.h>
29
30 #include <slre.h>
31
32 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
33 STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
34
35 #ifdef SLRE_TEST
36 static struct {
37 const char *name;
38 int narg;
39 const char *flags;
40 } opcodes[] = {
41 {"END", 0, ""}, /* End of code block or program */
42 {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
43 {"ANY", 0, ""}, /* Match any character, "." */
44 {"EXACT", 2, "d"}, /* Match exact string */
45 {"ANYOF", 2, "D"}, /* Match any from set, "[]" */
46 {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
47 {"OPEN ", 1, "i"}, /* Capture start, "(" */
48 {"CLOSE", 1, "i"}, /* Capture end, ")" */
49 {"BOL", 0, ""}, /* Beginning of string, "^" */
50 {"EOL", 0, ""}, /* End of string, "$" */
51 {"STAR", 1, "o"}, /* Match zero or more times "*" */
52 {"PLUS", 1, "o"}, /* Match one or more times, "+" */
53 {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
54 {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
55 {"QUEST", 1, "o"}, /* Match zero or one time, "?" */
56 {"SPACE", 0, ""}, /* Match whitespace, "\s" */
57 {"NONSPACE", 0, ""}, /* Match non-space, "\S" */
58 {"DIGIT", 0, ""} /* Match digit, "\d" */
59 };
60 #endif /* SLRE_TEST */
61
62 /*
63 * Commands and operands are all unsigned char (1 byte long). All code offsets
64 * are relative to current address, and positive (always point forward). Data
65 * offsets are absolute. Commands with operands:
66 *
67 * BRANCH offset1 offset2
68 * Try to match the code block that follows the BRANCH instruction
69 * (code block ends with END). If no match, try to match code block that
70 * starts at offset1. If either of these match, jump to offset2.
71 *
72 * EXACT data_offset data_length
73 * Try to match exact string. String is recorded in data section from
74 * data_offset, and has length data_length.
75 *
76 * OPEN capture_number
77 * CLOSE capture_number
78 * If the user have passed 'struct cap' array for captures, OPEN
79 * records the beginning of the matched substring (cap->ptr), CLOSE
80 * sets the length (cap->len) for respective capture_number.
81 *
82 * STAR code_offset
83 * PLUS code_offset
84 * QUEST code_offset
85 * *, +, ?, respectively. Try to gobble as much as possible from the
86 * matched buffer, until code block that follows these instructions
87 * matches. When the longest possible string is matched,
88 * jump to code_offset
89 *
90 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
91 */
92
93 static const char *meta_chars = "|.^$*+?()[\\";
94
95 #ifdef SLRE_TEST
96
97 static void
print_character_set(FILE * fp,const unsigned char * p,int len)98 print_character_set(FILE *fp, const unsigned char *p, int len)
99 {
100 int i;
101
102 for (i = 0; i < len; i++) {
103 if (i > 0)
104 (void) fputc(',', fp);
105 if (p[i] == 0) {
106 i++;
107 if (p[i] == 0)
108 (void) fprintf(fp, "\\x%02x", p[i]);
109 else
110 (void) fprintf(fp, "%s", opcodes[p[i]].name);
111 } else if (isprint(p[i])) {
112 (void) fputc(p[i], fp);
113 } else {
114 (void) fprintf(fp, "\\x%02x", p[i]);
115 }
116 }
117 }
118
119 void
slre_dump(const struct slre * r,FILE * fp)120 slre_dump(const struct slre *r, FILE *fp)
121 {
122 int i, j, ch, op, pc;
123
124 for (pc = 0; pc < r->code_size; pc++) {
125
126 op = r->code[pc];
127 (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
128
129 for (i = 0; opcodes[op].flags[i] != '\0'; i++)
130 switch (opcodes[op].flags[i]) {
131 case 'i':
132 (void) fprintf(fp, "%d ", r->code[pc + 1]);
133 pc++;
134 break;
135 case 'o':
136 (void) fprintf(fp, "%d ",
137 pc + r->code[pc + 1] - i);
138 pc++;
139 break;
140 case 'D':
141 print_character_set(fp, r->data +
142 r->code[pc + 1], r->code[pc + 2]);
143 pc += 2;
144 break;
145 case 'd':
146 (void) fputc('"', fp);
147 for (j = 0; j < r->code[pc + 2]; j++) {
148 ch = r->data[r->code[pc + 1] + j];
149 if (isprint(ch)) {
150 (void) fputc(ch, fp);
151 } else {
152 (void) fprintf(fp,
153 "\\x%02x", ch);
154 }
155 }
156 (void) fputc('"', fp);
157 pc += 2;
158 break;
159 }
160
161 (void) fputc('\n', fp);
162 }
163 }
164 #endif /* SLRE_TEST */
165
166 static void
set_jump_offset(struct slre * r,int pc,int offset)167 set_jump_offset(struct slre *r, int pc, int offset)
168 {
169 assert(offset < r->code_size);
170
171 if (r->code_size - offset > 0xff)
172 r->err_str = "Jump offset is too big";
173 else
174 r->code[pc] = (unsigned char) (r->code_size - offset);
175 }
176
177 static void
emit(struct slre * r,int code)178 emit(struct slre *r, int code)
179 {
180 if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
181 r->err_str = "RE is too long (code overflow)";
182 else
183 r->code[r->code_size++] = (unsigned char) code;
184 }
185
186 static void
store_char_in_data(struct slre * r,int ch)187 store_char_in_data(struct slre *r, int ch)
188 {
189 if (r->data_size >= (int) sizeof(r->data))
190 r->err_str = "RE is too long (data overflow)";
191 else
192 r->data[r->data_size++] = ch;
193 }
194
195 static void
exact(struct slre * r,const char ** re)196 exact(struct slre *r, const char **re)
197 {
198 int old_data_size = r->data_size;
199
200 while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
201 store_char_in_data(r, *(*re)++);
202
203 emit(r, EXACT);
204 emit(r, old_data_size);
205 emit(r, r->data_size - old_data_size);
206 }
207
208 static int
get_escape_char(const char ** re)209 get_escape_char(const char **re)
210 {
211 int res;
212
213 switch (*(*re)++) {
214 case 'n':
215 res = '\n';
216 break;
217 case 'r':
218 res = '\r';
219 break;
220 case 't':
221 res = '\t';
222 break;
223 case '0':
224 res = 0;
225 break;
226 case 'S':
227 res = NONSPACE << 8;
228 break;
229 case 's':
230 res = SPACE << 8;
231 break;
232 case 'd':
233 res = DIGIT << 8;
234 break;
235 default:
236 res = (*re)[-1];
237 break;
238 }
239
240 return res;
241 }
242
243 static void
anyof(struct slre * r,const char ** re)244 anyof(struct slre *r, const char **re)
245 {
246 int esc, old_data_size = r->data_size, op = ANYOF;
247
248 if (**re == '^') {
249 op = ANYBUT;
250 (*re)++;
251 }
252
253 while (**re != '\0')
254
255 switch (*(*re)++) {
256 case ']':
257 emit(r, op);
258 emit(r, old_data_size);
259 emit(r, r->data_size - old_data_size);
260 return;
261 /* NOTREACHED */
262 break;
263 case '\\':
264 esc = get_escape_char(re);
265 if ((esc & 0xff) == 0) {
266 store_char_in_data(r, 0);
267 store_char_in_data(r, esc >> 8);
268 } else {
269 store_char_in_data(r, esc);
270 }
271 break;
272 default:
273 store_char_in_data(r, (*re)[-1]);
274 break;
275 }
276
277 r->err_str = "No closing ']' bracket";
278 }
279
280 static void
relocate(struct slre * r,int begin,int shift)281 relocate(struct slre *r, int begin, int shift)
282 {
283 emit(r, END);
284 memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
285 r->code_size += shift;
286 }
287
288 static void
quantifier(struct slre * r,int prev,int op)289 quantifier(struct slre *r, int prev, int op)
290 {
291 if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
292 r->code[prev + 2]--;
293 emit(r, EXACT);
294 emit(r, r->code[prev + 1] + r->code[prev + 2]);
295 emit(r, 1);
296 prev = r->code_size - 3;
297 }
298 relocate(r, prev, 2);
299 r->code[prev] = op;
300 set_jump_offset(r, prev + 1, prev);
301 }
302
303 static void
exact_one_char(struct slre * r,int ch)304 exact_one_char(struct slre *r, int ch)
305 {
306 emit(r, EXACT);
307 emit(r, r->data_size);
308 emit(r, 1);
309 store_char_in_data(r, ch);
310 }
311
312 static void
fixup_branch(struct slre * r,int fixup)313 fixup_branch(struct slre *r, int fixup)
314 {
315 if (fixup > 0) {
316 emit(r, END);
317 set_jump_offset(r, fixup, fixup - 2);
318 }
319 }
320
321 static void
compile(struct slre * r,const char ** re)322 compile(struct slre *r, const char **re)
323 {
324 int op, esc, branch_start, last_op, fixup, cap_no, level;
325
326 fixup = 0;
327 level = r->num_caps;
328 branch_start = last_op = r->code_size;
329
330 for (;;)
331 switch (*(*re)++) {
332 case '\0':
333 (*re)--;
334 return;
335 /* NOTREACHED */
336 break;
337 case '^':
338 emit(r, BOL);
339 break;
340 case '$':
341 emit(r, EOL);
342 break;
343 case '.':
344 last_op = r->code_size;
345 emit(r, ANY);
346 break;
347 case '[':
348 last_op = r->code_size;
349 anyof(r, re);
350 break;
351 case '\\':
352 last_op = r->code_size;
353 esc = get_escape_char(re);
354 if (esc & 0xff00)
355 emit(r, esc >> 8);
356 else
357 exact_one_char(r, esc);
358 break;
359 case '(':
360 last_op = r->code_size;
361 cap_no = ++r->num_caps;
362 emit(r, OPEN);
363 emit(r, cap_no);
364
365 compile(r, re);
366 if (*(*re)++ != ')') {
367 r->err_str = "No closing bracket";
368 return;
369 }
370
371 emit(r, CLOSE);
372 emit(r, cap_no);
373 break;
374 case ')':
375 (*re)--;
376 fixup_branch(r, fixup);
377 if (level == 0) {
378 r->err_str = "Unbalanced brackets";
379 return;
380 }
381 return;
382 /* NOTREACHED */
383 break;
384 case '+':
385 case '*':
386 op = (*re)[-1] == '*' ? STAR : PLUS;
387 if (**re == '?') {
388 (*re)++;
389 op = op == STAR ? STARQ : PLUSQ;
390 }
391 quantifier(r, last_op, op);
392 break;
393 case '?':
394 quantifier(r, last_op, QUEST);
395 break;
396 case '|':
397 fixup_branch(r, fixup);
398 relocate(r, branch_start, 3);
399 r->code[branch_start] = BRANCH;
400 set_jump_offset(r, branch_start + 1, branch_start);
401 fixup = branch_start + 2;
402 r->code[fixup] = 0xff;
403 break;
404 default:
405 (*re)--;
406 last_op = r->code_size;
407 exact(r, re);
408 break;
409 }
410 }
411
412 int
slre_compile(struct slre * r,const char * re)413 slre_compile(struct slre *r, const char *re)
414 {
415 r->err_str = NULL;
416 r->code_size = r->data_size = r->num_caps = r->anchored = 0;
417
418 if (*re == '^')
419 r->anchored++;
420
421 emit(r, OPEN); /* This will capture what matches full RE */
422 emit(r, 0);
423
424 while (*re != '\0')
425 compile(r, &re);
426
427 if (r->code[2] == BRANCH)
428 fixup_branch(r, 4);
429
430 emit(r, CLOSE);
431 emit(r, 0);
432 emit(r, END);
433
434 return (r->err_str == NULL ? 1 : 0);
435 }
436
437 static int match(const struct slre *, int,
438 const char *, int, int *, struct cap *);
439
440 static void
loop_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)441 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
442 {
443 int saved_offset, matched_offset;
444
445 matched_offset = *ofs;
446
447 while (match(r, pc + 2, s, len, ofs, NULL)) {
448 saved_offset = *ofs;
449 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
450 matched_offset = saved_offset;
451 *ofs = saved_offset;
452 }
453
454 *ofs = matched_offset;
455 }
456
457 static void
loop_non_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)458 loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
459 {
460 int saved_offset = *ofs;
461
462 while (match(r, pc + 2, s, len, ofs, NULL)) {
463 saved_offset = *ofs;
464 if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
465 break;
466 }
467
468 *ofs = saved_offset;
469 }
470
471 static int
is_any_of(const unsigned char * p,int len,const char * s,int * ofs)472 is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
473 {
474 int i, ch;
475
476 ch = s[*ofs];
477
478 for (i = 0; i < len; i++)
479 if (p[i] == ch) {
480 (*ofs)++;
481 return 1;
482 }
483
484 return 0;
485 }
486
487 static int
is_any_but(const unsigned char * p,int len,const char * s,int * ofs)488 is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
489 {
490 int i, ch;
491
492 ch = s[*ofs];
493
494 for (i = 0; i < len; i++) {
495 if (p[i] == ch)
496 return 0;
497 }
498
499 (*ofs)++;
500 return 1;
501 }
502
503 static int
match(const struct slre * r,int pc,const char * s,int len,int * ofs,struct cap * caps)504 match(const struct slre *r, int pc, const char *s, int len,
505 int *ofs, struct cap *caps)
506 {
507 int n, saved_offset, res = 1;
508
509 while (res && r->code[pc] != END) {
510
511 assert(pc < r->code_size);
512 assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
513
514 switch (r->code[pc]) {
515 case BRANCH:
516 saved_offset = *ofs;
517 res = match(r, pc + 3, s, len, ofs, caps);
518 if (res == 0) {
519 *ofs = saved_offset;
520 res = match(r, pc + r->code[pc + 1],
521 s, len, ofs, caps);
522 }
523 pc += r->code[pc + 2];
524 break;
525 case EXACT:
526 res = 0;
527 n = r->code[pc + 2]; /* String length */
528 if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
529 r->code[pc + 1], n)) {
530 (*ofs) += n;
531 res = 1;
532 }
533 pc += 3;
534 break;
535 case QUEST:
536 res = 1;
537 saved_offset = *ofs;
538 if (!match(r, pc + 2, s, len, ofs, caps))
539 *ofs = saved_offset;
540 pc += r->code[pc + 1];
541 break;
542 case STAR:
543 res = 1;
544 loop_greedy(r, pc, s, len, ofs);
545 pc += r->code[pc + 1];
546 break;
547 case STARQ:
548 res = 1;
549 loop_non_greedy(r, pc, s, len, ofs);
550 pc += r->code[pc + 1];
551 break;
552 case PLUS:
553 res = match(r, pc + 2, s, len, ofs, caps);
554 if (res == 0)
555 break;
556
557 loop_greedy(r, pc, s, len, ofs);
558 pc += r->code[pc + 1];
559 break;
560 case PLUSQ:
561 res = match(r, pc + 2, s, len, ofs, caps);
562 if (res == 0)
563 break;
564
565 loop_non_greedy(r, pc, s, len, ofs);
566 pc += r->code[pc + 1];
567 break;
568 case SPACE:
569 res = 0;
570 if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
571 (*ofs)++;
572 res = 1;
573 }
574 pc++;
575 break;
576 case NONSPACE:
577 res = 0;
578 if (*ofs < len &&
579 !isspace(((unsigned char *)s)[*ofs])) {
580 (*ofs)++;
581 res = 1;
582 }
583 pc++;
584 break;
585 case DIGIT:
586 res = 0;
587 if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
588 (*ofs)++;
589 res = 1;
590 }
591 pc++;
592 break;
593 case ANY:
594 res = 0;
595 if (*ofs < len) {
596 (*ofs)++;
597 res = 1;
598 }
599 pc++;
600 break;
601 case ANYOF:
602 res = 0;
603 if (*ofs < len)
604 res = is_any_of(r->data + r->code[pc + 1],
605 r->code[pc + 2], s, ofs);
606 pc += 3;
607 break;
608 case ANYBUT:
609 res = 0;
610 if (*ofs < len)
611 res = is_any_but(r->data + r->code[pc + 1],
612 r->code[pc + 2], s, ofs);
613 pc += 3;
614 break;
615 case BOL:
616 res = *ofs == 0 ? 1 : 0;
617 pc++;
618 break;
619 case EOL:
620 res = *ofs == len ? 1 : 0;
621 pc++;
622 break;
623 case OPEN:
624 if (caps != NULL)
625 caps[r->code[pc + 1]].ptr = s + *ofs;
626 pc += 2;
627 break;
628 case CLOSE:
629 if (caps != NULL)
630 caps[r->code[pc + 1]].len = (s + *ofs) -
631 caps[r->code[pc + 1]].ptr;
632 pc += 2;
633 break;
634 case END:
635 pc++;
636 break;
637 default:
638 printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
639 assert(0);
640 break;
641 }
642 }
643
644 return res;
645 }
646
647 int
slre_match(const struct slre * r,const char * buf,int len,struct cap * caps)648 slre_match(const struct slre *r, const char *buf, int len,
649 struct cap *caps)
650 {
651 int i, ofs = 0, res = 0;
652
653 if (r->anchored) {
654 res = match(r, 0, buf, len, &ofs, caps);
655 } else {
656 for (i = 0; i < len && res == 0; i++) {
657 ofs = i;
658 res = match(r, 0, buf, len, &ofs, caps);
659 }
660 }
661
662 return res;
663 }
664
665 #ifdef SLRE_TEST
666 #define N_CAPS 5
667
main(int argc,char * argv[])668 int main(int argc, char *argv[])
669 {
670 struct slre slre;
671 struct cap caps[N_CAPS];
672 unsigned char data[1 * 1024 * 1024];
673 FILE *fp;
674 int i, res, len;
675
676 if (argc < 2) {
677 fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
678 return 1;
679 }
680
681 fp = fopen(argv[2], "rb");
682 if (fp == NULL) {
683 fprintf(stderr, "Error: cannot open %s:%s\n",
684 argv[2], strerror(errno));
685 return 1;
686 }
687
688 if (!slre_compile(&slre, argv[1])) {
689 fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
690 return 1;
691 }
692
693 slre_dump(&slre, stderr);
694
695 while (fgets(data, sizeof(data), fp) != NULL) {
696 len = strlen(data);
697
698 if ((len > 0) && (data[len-1] == '\n')) {
699 data[len-1] = '\0';
700 --len;
701 }
702
703 printf("Data = \"%s\"\n", data);
704
705 (void) memset(caps, 0, sizeof(caps));
706
707 res = slre_match(&slre, data, len, caps);
708 printf("Result [%d]: %d\n", i, res);
709
710 for (i = 0; i < N_CAPS; i++) {
711 if (caps[i].len > 0) {
712 printf("Substring %d: len=%d [%.*s]\n", i,
713 caps[i].len,
714 caps[i].len, caps[i].ptr);
715 }
716 }
717 printf("----------------------------------------------------\n");
718 }
719 (void) fclose(fp);
720
721 return 0;
722 }
723 #endif /* SLRE_TEST */
724