/* ----------------------------------------------------------------- */ /* Copyright (c) 2007 by Richard Harter */ /* */ /* Permission is hereby granted, free of charge, to any person */ /* obtaining a copy of this software and associated documentation */ /* files (the "Software"), to deal in the Software without */ /* restriction, including without limitation the rights to use, */ /* copy, modify, merge, publish, distribute, sublicense, and/or */ /* sell copies of the Software, and to permit persons to whom the */ /* Software is furnished to do so, subject to the following */ /* conditions: */ /* */ /* The above copyright notice and this permission notice shall be */ /* included in all copies or substantial portions of the */ /* Software. */ /* */ /* Derivative works shall include a notice that the software is a */ /* modified version of the copyrighted software. */ /* */ /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY */ /* KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE */ /* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR */ /* PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR */ /* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */ /* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR */ /* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* */ /* ----------------------------------------------------------------- */ /* Revision History */ /* */ /* April 22, 2007 - Base release 1.0.0 */ /* */ /* ----------------------------------------------------------------- */ #include #include #include #include "htree.h" #define U_INT32 unsigned long #define UCHAR unsigned char static HT_SIBS * double_sibs (HT_ROOT *, int, int, HT_SIBS *, HT_NODE **); static void free_node (HT_ROOT *, HT_NODE *); static int get_diff_pos (int, unsigned char *, unsigned char *); static HT_NODE * get_ht_node (HT_ROOT *); static HT_SIBS * get_ht_sibs (HT_ROOT *, int); static HT_BAGS * get_bag (HT_ROOT *, int, size_t); static HT_NODE * new_ht_node (HT_ROOT *, HT_DATA *); static HT_NODE * new_ht_pair (HT_ROOT *, int, HT_SIBS *, HT_DATA *,int); /**************************************************************/ /* External (public ) routines */ /* */ /* htree_init.......... Creates an H tree root node */ /* htree_close......... Deletes an H tree */ /* htree_find.......... Finds a data element in an H tree */ /* htree_add........... Adds a data element to an H tree */ /* htree_del........... Deletes a data element from an H tree */ /* htree_print_forest.. Prints all H trees for a root node */ /* htree_print_tree.... Prints an H tree for a given data size*/ /* htree_verify........ Verifies that two elements are equal */ /* */ /**************************************************************/ /**************************************************************/ /* */ /* htree_init: */ /* This routine creates an H tree master root node and */ /* initializes it. */ /* */ /**************************************************************/ HT_ROOT * htree_init() { HT_ROOT *root; int i; int sizes[N_SIB_FREELISTS] = {8,16,32,64,128,256}; root = malloc(sizeof (HT_ROOT)); if(!root){ printf("Malloc failed in htree_init\n"); exit(EXIT_FAILURE); } for (i=0;iroots[i] = 0; root->sib_bags_header = 0; for (i=0;isib_sizes [i] = sizes[i]; root->sib_freelists[i] = 0; root->sib_bags [i] = 0; } root->node_bags = 0; return root; } /**************************************************************/ /* */ /* htree_close: */ /* This routine closes an H tree. It deletes all storage */ /* for the tree. It returns 1 (true) if the tree was */ /* successfully closed and 0 (false) if it was not. */ /* */ /**************************************************************/ int htree_close(HT_ROOT *root) { HT_BAGS *bag; HT_BAGS *next; int i; if (!root) return 0; for (i=0;isib_bags[i];bag;bag = next) { next = bag->link; free (bag); } } for (bag=root->node_bags;bag;bag = next) { next = bag->link; free (bag); } return 1; } /**************************************************************/ /* */ /* htree_find: */ /* This routine searches an H tree for a data element. If it */ /* doesn't find it, it returns 0. If it does find it, it */ /* fills the identification field from the value stored in */ /* the data descriptor in the tree. */ /* */ /**************************************************************/ HT_DATA * htree_find(HT_ROOT * root, HT_DATA *datum) { HT_NODE * node; HT_SIBS * sib; HT_SIBS * sib_last; int selector; if (!datum || (datum->len > HT_MAX_SMALL_STRING_SIZE) || !(node = root->roots[datum->len])) return 0; for (;;) { selector = datum->data[node->index]; sib = node->sibs; sib_last = sib + node->num_sibs; sib_last->selector = selector; while (sib->selector != selector) sib++; if (sib == sib_last) return 0; if (!sib->kids) { if (htree_verify(datum->len,datum->data,sib->datum.data)) { datum->id = sib->datum.id; return datum; } return 0; } node = sib->kids; } return 0; } /**************************************************************/ /* */ /* htree_add: */ /* This routine adds a data element. First it searches to */ /* see if the element is already present. If it is, the */ /* attempt fails and 0 is returned. If it is not present, */ /* the element is added, the ID field is updated, and the */ /* updated data descriptor is returned. */ /* */ /**************************************************************/ HT_DATA * htree_add(HT_ROOT *root, HT_DATA *new_data,int id) { unsigned char *s; int selector; int len; int diff_pos = 0; int old_alloc; HT_NODE * node; HT_SIBS *sib; HT_SIBS *sib_last; if (!new_data || ((len = new_data->len) > HT_MAX_SMALL_STRING_SIZE)) return 0; s = new_data->data; node = root->roots[len]; if (!node) { new_data->id = id; node = new_ht_node(root, new_data); root->roots[len] = node; return new_data; } if (node->num_sibs == 1) { sib = node->sibs; diff_pos = get_diff_pos(len, sib->datum.data,s); if (diff_pos >= 0) { node->index = diff_pos; node->num_sibs = 2; sib++; sib->kids = 0; sib->datum.len = len; sib->datum.data = s; sib->datum.id = id; sib->selector = s[diff_pos]; new_data->id = id; return new_data; } else return 0; } for (;;) { selector = (int) s[node->index]; sib = node->sibs; sib_last = sib + node->num_sibs; sib_last->selector = selector; while (sib->selector != selector) sib++; if (sib == sib_last) { node->num_sibs++; if (node->num_sibs == node->max_num_sibs) { old_alloc = node->max_num_sibs; node->max_num_sibs *= 2; node->sibs = double_sibs (root, old_alloc,node->max_num_sibs, node->sibs,&root->roots[len]); sib = node->sibs + old_alloc - 1; sib->selector = selector; } sib->kids = 0; sib->datum.data = s; sib->datum.len = len; sib->datum.id = id; break; } else if (sib->kids) { node = sib->kids; continue; } else { diff_pos = get_diff_pos(len, sib->datum.data,s); if (diff_pos < 0) return 0; node = new_ht_pair(root,diff_pos,sib,new_data,id); sib->kids = node; break; } } new_data->id = id; return new_data; } /**************************************************************/ /* */ /* htree_del: */ /* This routine deletes a data element from the tree. It */ /* returns 1 (true) if the deletion was successful and 0 */ /* (false) if it was not. */ /* */ /* The delete algorithm is complex; here is a summary */ /* description: */ /* */ /* Step 0: Verify the legitimacy of the input. */ /* */ /* Step 1: Locate the first occurence of the datum to be */ /* deleted. The search will use the find algorithm */ /* but will check for an id match. If none is */ /* found we exit with a failed result. */ /* */ /* If one is found it is guaranteed that it will have */ /* at least one sibling. */ /* */ /* Step 2: We initiate a loop. The initial condition for the */ /* loop is that a variable called node points to the */ /* first node in which the item to be deleted occurs. */ /* The node will terminate when there is no longer */ /* any such node in the tree. */ /* */ /* Step 3: The first thing we do in the loop is to search down*/ /* in the tree from the node. There are three */ /* possibilities: (a) there are no descendents, (b) */ /* there is a sequence of singleton descendents that */ /* ends in no descendents, or (c) a sequence of zero */ /* or more singletons leading a node with at least */ /* two descendents. */ /* */ /* 3a: We delete the childless datum and return. */ /* 3b: We delete the datum and its chain of */ /* singletons and return. */ /* 3c: We find a sibling in the new node; we replace */ /* all references to the datum by its sibling */ /* in the chain leading from original node to the */ /* new node. */ /* */ /* Step 4: The new node found in step 3c replaces the */ /* orignal node in the loop; we find the right */ /* in the new node and then repeat the loop. */ /* */ /**************************************************************/ int htree_del(HT_ROOT * root, HT_DATA * datum) { HT_NODE * node; HT_NODE * p_node; HT_NODE * s_node; HT_SIBS * rep; HT_SIBS * sib; HT_SIBS * sav_sib; HT_SIBS * sib_last; int selector; int id; /* Checking data and tree validity */ if (!datum || (datum->len > HT_MAX_SMALL_STRING_SIZE) || !(node = root->roots[datum->len])) return 0; /* Searching for the first occurrence of the datum */ id = datum->id; for (;;) { selector = datum->data[node->index]; sib = node->sibs; sib_last = sib + node->num_sibs; sib_last->selector = selector; while (sib->selector != selector) sib++; if (sib == sib_last) return 0; if (sib->datum.id == datum->id) break; if (!sib->kids) return 0; node = sib->kids; } /* We reach this point if the datum is in the tree */ for (;;) { /* Here we search for the successor; call it s_node */ s_node = node; sav_sib = sib; for (;;) { p_node = s_node; s_node = sib->kids; if (!s_node) break; sib = s_node->sibs; if (s_node->num_sibs > 1) break; } /* If there is no successor, there may be a chain */ /* of singletons. Kill the one found in node and */ /* its singleton progeny. */ if (!s_node) { node->num_sibs--; sib = sav_sib; s_node = sib->kids; sib->kids = node->sibs[node->num_sibs].kids; sib->selector = node->sibs[node->num_sibs].selector; sib->datum.len = node->sibs[node->num_sibs].datum.len; sib->datum.data = node->sibs[node->num_sibs].datum.data; sib->datum.id = node->sibs[node->num_sibs].datum.id; while (s_node) { p_node = s_node->sibs->kids; free_node(root,s_node); s_node = p_node; } return 1; } /* We have a successor. Pick a sibling as a replacement */ /* and substitute it for the one to be deleted in the */ /* and all descendents down to but not including the */ /* successor node. */ rep = s_node->sibs + ((s_node->sibs->datum.id == id)? 1:0); for (p_node=node,sib=sav_sib; p_node != s_node;) { sib->datum.id = rep->datum.id; sib->datum.data = rep->datum.data; sib->datum.len = rep->datum.len; p_node = sib->kids; sib = p_node->sibs; } node = s_node; for (sav_sib=node->sibs;sav_sib->datum.id != id;sav_sib++); } return 0; } /**************************************************************/ /* */ /* htree_print_forest: */ /* This routine prints the full H tree. There is a different */ /* H tree for each data element length. */ /* */ /**************************************************************/ void htree_print_forest(FILE * ofptr, HT_ROOT * root) { int i; HT_NODE ** roots; roots = root->roots; for (i=0;inum_sibs,node->index); for (i = 0; i < node->num_sibs; i++) { s = node->sibs[i].datum.data; for (j=0;j<(level+1);j++) fprintf(ofptr,"%3c",' '); fprintf(ofptr,"(%d) %s\n", node->sibs[i].datum.id, node->sibs[i].datum.data); } for (i = 0; i < node->num_sibs; i++) { htree_print_tree(ofptr,level+1,i,node->sibs[i].kids); } } /**************************************************************/ /* */ /* htree_verify; */ /* This routine compares two data elements and reports */ /* 1 (true) if the are equal and 0 (false) if they are not. */ /* The code is optimized for the case where are equal. */ /* */ /**************************************************************/ int htree_verify(int len, unsigned char *s0, unsigned char *s1) { int i; int val = 0; unsigned int *b0; unsigned int *b1; b0 = (unsigned int *)s0; b1 = (unsigned int *)s1; for (i = len/8;i>0;i--) { val |= (*b0++ ^ *b1++); val |= (*b0++ ^ *b1++); } s0 = (unsigned char *)b0; s1 = (unsigned char *)b1; for (i = len&7; i>0;i--) { val |= (*s0++ ^ *s1++); } return !val; } /**************************************************************/ /* Internal (private) routines */ /* */ /* get_diff_pos...... For two strings, find a pos that diffs */ /* new_ht_node....... Gets a new node for a data item */ /* new_ht_pair....... Gets a new node for a pair of items */ /* double_sibs....... Increase the size of a siblings array */ /* get_ht_node....... Allocates space for an ht node */ /* get_ht_sibe....... Allocates space for siblings arrays */ /* free_node......... Moves a node and its sibs to freelists */ /* get_bag........... Gets a bag and allocates its space */ /* */ /**************************************************************/ /**************************************************************/ /* */ /* get_diff_pos: */ /* This routine compares two data elements and idnetifies */ /* a byte position at which they differ. Zero indexing is */ /* used. A value of -1 is returned if they are identical. */ /* */ /* This routine is similar to ht_verify; however the objective*/ /* is different. The current implemention finds the first */ /* byte at which the strings differ; this may not be the best */ /* strategy. */ /* */ /**************************************************************/ static int get_diff_pos(int len, unsigned char *s0, unsigned char *s1) { int i; for (i=0;iindex = 0; node->num_sibs = 1; sib = node->sibs; sib[0].kids = 0; sib[0].datum.len = dat->len; sib[0].datum.data = dat->data; sib[0].datum.id = dat->id; sib[0].selector = dat->data[0]; return node; } /**************************************************************/ /* */ /* new_ht_pair: */ /* This routine creates a new node containing a pair of */ /* siblings, one inherited from its parent sibling and one */ /* for a new datum. Get_ht_node handles space allocation; */ /* new_ht_pair takes care of initialization. */ /* */ /**************************************************************/ static HT_NODE * new_ht_pair(HT_ROOT *root, int pos, HT_SIBS * usib, HT_DATA * dat1, int id) { HT_NODE *node; HT_SIBS *sib; node = get_ht_node(root); node->index = pos; node->num_sibs = 2; sib = node->sibs; sib[0].kids = 0; sib[0].datum.len = usib->datum.len; sib[0].datum.data = usib->datum.data; sib[0].datum.id = usib->datum.id; sib[0].selector = usib->datum.data[pos]; sib[1].kids = 0; sib[1].datum.len = dat1->len; sib[1].datum.data = dat1->data; sib[1].datum.id = id; sib[1].selector = dat1->data[pos]; return node; } /**************************************************************/ /* */ /* double_sibs: */ /* This routine increases the size of the siblings array. */ /* */ /**************************************************************/ static HT_SIBS * double_sibs(HT_ROOT *root, int oldsize, int newsize, HT_SIBS *s, HT_NODE **nodeptr) { HT_SIBS *ss; ss = get_ht_sibs(root, newsize); memcpy(ss,s,oldsize*sizeof(HT_SIBS)); return ss; } /**************************************************************/ /* */ /* get_ht_node: */ /* This routine allocates space for a new node. It avoids */ /* excessive calls to malloc and free by using a freelist and */ /* bag scheme. */ /* */ /**************************************************************/ static HT_NODE * get_ht_node(HT_ROOT *root) { HT_NODE *node; HT_BAGS *bag; if (root->node_freelist) { node = root->node_freelist; root->node_freelist = root->node_freelist->link; } else { if (!root->node_bags || (root->node_bags->pos <= 0)) { bag = get_bag(root, BAG_SIZE, sizeof (HT_NODE)); bag->link = root->node_bags; root->node_bags = bag; } --(root->node_bags->pos); node = (HT_NODE *)root->node_bags->space; node += root->node_bags->pos; } node->max_num_sibs = root->sib_sizes[0]; node->num_sibs = 0; node->link = 0; node->sibs = get_ht_sibs(root, root->sib_sizes[0]); return node; } /**************************************************************/ /* */ /* get_ht_sibs: */ /* This routine allocates space for a new siblings array. */ /* It uses a freelist and bag scheme with separate freelists */ /* and bags for each distinct length. */ /* */ /**************************************************************/ static HT_SIBS * get_ht_sibs(HT_ROOT *root, int len) { HT_SIBS *sibs; HT_BAGS *bag; int i; for (i=0;(i root->sib_sizes[i]);i++) if (i >= N_SIB_FREELISTS) { fprintf(stderr, "Allocation error - requested sibling list size too large.\n"); exit(EXIT_FAILURE); } if (root->sib_freelists[i]) { sibs = root->sib_freelists[i]; root->sib_freelists[i] = sibs->link; } else { if (!root->sib_bags[i] || (root->sib_bags[i]->pos <= 0)) { bag = get_bag(root, BAG_SIZE, len*sizeof (HT_SIBS)); bag->link = root->sib_bags[i]; root->sib_bags[i] = bag; } --(root->sib_bags[i]->pos); sibs = (HT_SIBS *)root->sib_bags[i]->space; sibs += len*root->sib_bags[i]->pos; } return sibs; } /**************************************************************/ /* */ /* free_node: */ /* This routine adds a node to the node free list and its */ /* sib array to the appropriate sib free list. */ /* */ /**************************************************************/ static void free_node(HT_ROOT *root, HT_NODE * node) { int i; for (i=0;(imax_num_sibs > root->sib_sizes[i]) break; } if (i >= N_SIB_FREELISTS) { fprintf(stderr,"Illegal allocation length in a freed node.\n"); exit(EXIT_FAILURE); } node->sibs->link = root->sib_freelists[i]; root->sib_freelists[i] = node->sibs; node->link = root->node_freelist; root->node_freelist = node; } /**************************************************************/ /* */ /* get_bag: */ /* This routine allocates space for a new bag. The arguments */ /* are the number of elements in the bag and the size of an */ /* element. */ /* */ /**************************************************************/ static HT_BAGS * get_bag(HT_ROOT * root, int num, size_t size) { HT_BAGS *bag; bag = malloc(sizeof (HT_BAGS)); if (!bag) { fprintf(stderr,"Can't allocate space for a bag\n"); exit(EXIT_FAILURE); } bag->space = malloc(num *size); if (!bag->space) { fprintf(stderr,"Can't allocate space within a bag\n"); exit(EXIT_FAILURE); } bag->pos = num; bag->link = root->sib_bags_header; root->sib_bags_header = bag; return bag; }