annotate regex/main.cc @ 38:d15b9d342421

add regex
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Sun, 01 Mar 2015 14:10:25 +0900
parents
children 120c8116e831
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
38
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
1 #include <stdio.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
2 #include <stdlib.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
3 #include <unistd.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
4 #include <math.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
5 #include <string.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
6 #include <sys/mman.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
7 #include <sys/stat.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
8 #include <sys/types.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
9 #include <fcntl.h>
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
10
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
11 const char *usr_help_str = "Usage: ./regex [-file filename] [-sw SearchWord]\n";
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
12
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
13 //Boyer Moore法に使用するテーブルを作成
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
14 int*
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
15 createBMskiptable(u_char *search_word,int search_word_len,int *skip_table)
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
16 {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
17 for(int i = 0; i < 256; ++i){
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
18 skip_table[i] = search_word_len;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
19 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
20
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
21 for(int j = 0; j < search_word_len - 1; ++j){
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
22 skip_table[search_word[j]] = search_word_len - j - 1;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
23 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
24 return skip_table;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
25 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
26
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
27 //Boyer Moore法による文字列検索アルゴリズム
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
28 static int BMmethod(u_char *text,int textLen,
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
29 u_char *pattern,int swLen,int *skipTable)
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
30 {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
31 int i = swLen - 1;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
32 int matchCounter = 0;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
33
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
34 while ( i < textLen){
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
35 int j = swLen - 1;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
36 while (text[i] == pattern[j]){
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
37 if (j == 0){
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
38 matchCounter++;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
39 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
40 --i;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
41 --j;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
42 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
43 i = i + fmax(skipTable[text[i]],swLen - j);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
44 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
45 return matchCounter;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
46 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
47
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
48 int main(int argc, char* argv[]) {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
49
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
50 char *filename = 0;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
51 // "u_char" is "unsigned char" in sys/types.h
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
52 u_char *searchWord = 0;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
53
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
54 // check argument
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
55 for (int i = 0; argv[i]; i++) {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
56 if (strcmp(argv[i], "-file") == 0) {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
57 filename = (char*)argv[i+1]; i++;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
58 }else if (strcmp(argv[i], "-sw") == 0) {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
59 searchWord = (u_char*)argv[i+1]; i++;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
60 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
61 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
62
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
63 // prepare file read
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
64 if (filename == 0) {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
65 puts(usr_help_str);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
66 exit(1);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
67 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
68
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
69 struct stat sb;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
70 long fd = 0;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
71 char *textfile;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
72
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
73 if ((fd=open(filename,O_RDONLY,0666))==0) {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
74 fprintf(stderr,"can't open %s\n",filename);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
75 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
76
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
77 if (fstat(fd,&sb)) {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
78 fprintf(stderr,"can't fstat %s\n",filename);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
79 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
80
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
81 textfile = (char*)malloc(sb.st_size);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
82 read(fd,textfile,sb.st_size);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
83
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
84 if (textfile == (char*)-1) {
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
85 fprintf(stderr,"Can't mmap file\n");
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
86 perror(NULL);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
87 exit(0);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
88 }
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
89
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
90 // prepare Boyer Moore Search
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
91
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
92 int *skipTable = (int*)malloc(256 * sizeof(int)); // 文字列に対応した table を用意
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
93 int searchWordLen = strlen((const char*)searchWord);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
94 int *BMskip_table = createBMskiptable(searchWord, searchWordLen, skipTable);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
95
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
96 int matchNum = BMmethod((u_char*)textfile,sb.st_size,searchWord,searchWordLen,BMskip_table);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
97
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
98 printf("sword: %s len: %d\n",searchWord,searchWordLen);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
99 printf("Match : %d\n",matchNum);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
100
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
101 free(textfile);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
102 free(skipTable);
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
103 return 0;
d15b9d342421 add regex
Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
parents:
diff changeset
104 }