Sorry for late reply. My implementation as follow: I modified the original algorithm, when cur state is root, using the 2-gram hash table to accelerate filtering. In following implementation, building dat table dependent on the Trie structure that had been builded.
References: http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=31365&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D31365 --------------------------------------- #include <stdio.h> #include <stdlib.h> #include "common.h" #include "my_dat_ac.h" #define T_SIZE 655360 #define printf(fmt,arg...) struct input_list { unsigned char node; struct bfs_list *next; }; enum { T_MUL_ST = 1, // multi-state T_SIN_ST, // sigle-state T_TER_ST, // terminal-state T_SEP_ST // separate-state }; struct _state_node { struct my_ac_patt * list; uint32_t fail; }; struct _state_build { struct input_list *valid_input_bfs; struct input_list *valid_input_bfs_last; uint16_t type; uint8_t from_char; struct _state_node state; }; struct _state_build build_new[T_SIZE] = {}; static uint32_t new[T_SIZE] = {}; static uint32_t base[T_SIZE] = {}; static uint32_t check[T_SIZE] = {}; static struct _state_node check_match[T_SIZE] = {}; static uint32_t old[T_SIZE] = {}; static uint32_t dat_hash[65536] = {}; static int my_input_enqueue(struct input_list **bfs, struct input_list **last, unsigned char n) { struct input_list *new; new = (struct input_list *) malloc(sizeof(struct input_list)); if(!new) { cli_errmsg("bfs_enqueue: Can't allocate memory for input_list\n"); return CL_EMEM; } new->next = NULL; new->node = n; if(*last) { (*last)->next = new; *last = new; } else { *bfs = *last = new; } return CL_SUCCESS; } static int my_input_dequeue(struct input_list **bfs, struct input_list **last) { struct input_list *lpt; int pt; if(!(lpt = *bfs)) { return -1; } else { *bfs = (*bfs)->next; pt = lpt->node; if(lpt == *last) *last = NULL; free(lpt); return pt; } } static int my_input_queue_foreach(struct input_list **bfs) { struct input_list *lpt; int pt; if(!(lpt = *bfs)) { return -1; } else { *bfs = (*bfs)->next; pt = lpt->node; return pt; } } extern struct bitmap_state *get_bit_popcnt_next_state(struct bitmap_state *pcur_state, unsigned char a); static int max_check_index = 0; int my_dat_ac_build(struct my_matcher *root) { int count1 =0; struct bfs_list *multi_bfs = NULL, *multi_bfs_last = NULL; struct bfs_list *sep_bfs = NULL, *sep_bfs_last = NULL; int res; uint8_t a; struct input_list *tmp; //init value------s : = 1 ; new(s) : = 1 ; queue := { s } ; int i; for(i=0;i<T_SIZE;i++) base[i] = 2; struct bitmap_state *bitmap_st = root->ac_root; struct bitmap_state *bitmap_st1; uint32_t s = bitmap_st->state_id , s1; new[s] = bitmap_st->state_id; old[bitmap_st->state_id] = s; //base[new[s]] = bitmap_st->state_id; my_bfs_enqueue(&multi_bfs, &multi_bfs_last, (void*)bitmap_st); /* Routine for multistates */ while((bitmap_st = my_bfs_dequeue(&multi_bfs, &multi_bfs_last)) != (void*)queue_NULL) { s = (uint32_t)bitmap_st->state_id; printf("-----M-----%03d(%03d)----------\n", s, new[s]); /* Let s be the next state in queue */ abcd: tmp = build_new[s].valid_input_bfs; while((res = my_input_queue_foreach(&tmp)) != -1) { a = (uint8_t)res; if(check[base[new[s]] + a] != 0){ base[new[s]] = base[new[s]] + 1; goto abcd; } } while((res = my_input_dequeue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last)) != -1) { /*for each a in list[s]*/ a = (uint8_t)res; // while(check[base[new[s]] + a] != 0){ // base[new[s]] = base[new[s]] + 1; // } check[base[new[s]] + a] = new[s]; bitmap_st1 = get_bit_popcnt_next_state(bitmap_st, a); s1 = bitmap_st1->state_id; new[s1] = base[new[s]] + a; old[base[new[s]] + a] = s1; if(new[s1] > max_check_index){ max_check_index = new[s1]; } printf("o_stat %03d -%02x- new_stat %03d;;;;%d\n", s1, a, new[s1], ++count1); //build hash if(bitmap_st1->level == 2){ uint8_t hash_str[2]; hash_str[0] = build_new[s].from_char; hash_str[1] = build_new[s1].from_char; uint16_t hash_val = *(uint16_t*)hash_str; dat_hash[hash_val] = new[s1]; } // check_match[new[s1]].fail = new[build_new[s1].state.fail]; // check_match[new[s1]].list = build_new[s1].state.list; if(build_new[s1].type != T_SEP_ST){ my_bfs_enqueue(&multi_bfs, &multi_bfs_last, (void*)bitmap_st1); } else { my_bfs_enqueue(&sep_bfs, &sep_bfs_last, (void*)bitmap_st1); } } } /* Routine for single state */ while((bitmap_st = my_bfs_dequeue(&sep_bfs, &sep_bfs_last)) != queue_NULL) { s = (uint32_t)bitmap_st->state_id; printf("-----S-----%03d(%03d)----------\n", s, new[s]); while(build_new[s].type != T_TER_ST){ a = (uint8_t)my_input_dequeue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last); while(check[base[new[s]] + a] != 0){ base[new[s]] = base[new[s]] + 1; } check[base[new[s]] + a] = new[s]; bitmap_st1 = get_bit_popcnt_next_state(bitmap_st, a); s1 = bitmap_st1->state_id; new[s1] = base[new[s]] + a; old[base[new[s]] + a] = s1; if(new[s1] > max_check_index){ max_check_index = new[s1]; } //build hash if(bitmap_st1->level == 2){ uint8_t hash_str[2]; hash_str[0] = build_new[s].from_char; hash_str[1] = build_new[s1].from_char; uint16_t hash_val = *(uint16_t*)hash_str; dat_hash[hash_val] = new[s1]; } printf("o_stat %03d -%02x- new_stat %03d;;;;%d\n", s1, a, new[s1], ++count1); // check_match[new[s1]].fail = new[build_new[s1].state.fail]; // check_match[new[s1]].list = build_new[s1].state.list; s = s1; bitmap_st = bitmap_st1; } } printf("max_check_index %d\n", max_check_index); for(i = 0;i<=max_check_index;i++) { check_match[i].fail = new[build_new[old[i]].state.fail]; check_match[i].list = build_new[old[i]].state.list; } return 0; } static unsigned int fail_count[32] = {}; uint32_t my_dat_ac_fail_proc(unsigned char a, uint32_t s) { #if 0 // return 1; // uint32_t t; // s = check_match[s].fail; // t = a + base[s]; // if(check[t] != s) // return 1; // else // return t; static int tdfsdfasdf = 1; tdfsdfasdf++; return 1; #else uint32_t t; int i = 0; do{ fail_count[i++]++; if(s == 1) return s; s = check_match[s].fail; t = a + base[s]; }while(check[t] != s); return t; #endif } static uint32_t st_hash_fail_count = 0; static uint32_t st_hash_ref_count = 0; uint32_t my_dat_ac_fail_proc_hash(uint32_t s, uint8_t *buffer, uint32_t *i, uint32_t length) { uint32_t t; int k = 0; uint32_t j; do{ fail_count[k++]++; if(s == 1){//root loop: j = *i; //hash search for(;j<length-1;j++) { st_hash_ref_count++; if(dat_hash[*(uint16_t*)(buffer+j)]){ *i = j+1; return dat_hash[*(uint16_t*)(buffer+j)]; } st_hash_fail_count++; } *i = j+1; return 0; } s = check_match[s].fail; if(s == 1) goto loop; t = buffer[*i] + base[s]; }while(check[t] != s); return t; } int my_dat_ac_scanbuff(const unsigned char *buffer, uint32_t length,const struct my_matcher *root) { int i; // for(i = 1;i< root->ac_nodes+1;i++) // { // //printf("old s %03d 's type is %d\n", i, build_new[i].type); // printf("old s %03d new_stat is %d\n", i, new[i]); // } // for(i = 0;i< T_SIZE;i++) // { // if(check_match[i].fail == 0) // printf("stat %05d fail null[%d]\n", old[i], i); // } // fprintf(stdout, "trans num: %d\n", my_ac_trans_sum(root)); //return 0; memset((void*)fail_count, 0x00, sizeof(fail_count)); uint32_t s = 1, t; printf("my_dat_ac_scanbuff\n", i, build_new[i].type); int count = 0; for (i = 0; i < length; i++) { t = buffer[i] + base[s]; if(check[t] != s){ //fail //t = my_dat_ac_fail_proc(buffer[i], s); t = my_dat_ac_fail_proc_hash(s, buffer, &i, length); //continue; } // if(s == 1) // st_hash_ref_count++; printf("%05d --%02x--> %05d\n", old[s], buffer[i], old[t]); s=t; if(check_match[s].list){ //final //fprintf(stdout, "hit:sigid %d:pos %d:\n", check_match[s].list->sigid, i); count++; } } // fprintf(stdout, "hit count %d, st_hash_fail_count %u st_hash_ref_count %u\n", count, st_hash_fail_count, st_hash_ref_count); // for(i = 0;i<8;i++) // fprintf(stdout, "%u,", fail_count[i]); // fprintf(stdout, "\n"); return s; } int my_dat_update_input(uint32_t s, unsigned char a, uint32_t t) { printf("id:%03d -- %02x -- id:%03d\n", s, a, t); my_input_enqueue(&build_new[s].valid_input_bfs, &build_new[s].valid_input_bfs_last, a); build_new[t].from_char = a; return 0; } int my_dat_update_patlist(uint32_t s, struct my_ac_patt *list) { build_new[s].state.list = list; return 0; } int my_dat_update_fail(uint32_t s,uint32_t fail) { build_new[s].state.fail = fail; return 0; } int my_dat_update_terminal(uint32_t s) { build_new[s].type = T_TER_ST; return 0; } int my_dat_update_type(struct bitmap_state **st_array, uint16_t array_len) { int i = array_len-1; if(build_new[st_array[i]->state_id].type !=0) return 0; int flag = 0; for(; i>=0;i--) { if(st_array[i]->sumary[_bitmap_group] > 1 || flag) { build_new[st_array[i]->state_id].type = T_MUL_ST; flag = 1; } else build_new[st_array[i]->state_id].type = T_SEP_ST; } return 0; } _______________________________________________ http://lurker.clamav.net/list/clamav-devel.html Please submit your patches to our Bugzilla: http://bugs.clamav.net