From 2314a195862243e09c485a66194866517a6f8c31 Mon Sep 17 00:00:00 2001 From: Daniel Beecham Date: Sun, 20 Mar 2022 02:04:18 +0100 Subject: [PATCH] feat: Rewrite the trie example (#927) The trie example had some issues; * It did not follow the code convention in CONTRIBUTING.md * The createTrieNode used an inefficient zeroing method (looping over the entries) which also does not zero out holes in the structure (e.g. an alternative would be to use "*node = &(TrieNode){0}", but calloc does all that anyway * It used an inefficient and clumsy printArray method * It used strlen inside the algorithm; this new method could get rid of any strlen/strnlen usage (inserts/searches could be sanitized by snprintf) * This version can allow for a custom mapping function, e.g. if NULL is a valid separator (say that you want a trie for certain binary packages) * The previous version actually contained out-of-bounds array indexing; there were no checks for out-of-bound indexing and words in the word list did contain out of bounds words. It's a surprise it was working so well. * This version just returns 'int' to allow for error checks (instead of a printf inside the algorithm), and uses double pointers for return values (good practice) * The usage example contained unnecessary mallocs, switched that out for scanf. The example is just an example after all, in real applications you'd have better input sanitazion. --- data_structures/trie/dictionary.txt | 26 +-- data_structures/trie/trie.c | 271 +++++++++++++++------------- 2 files changed, 146 insertions(+), 151 deletions(-) diff --git a/data_structures/trie/dictionary.txt b/data_structures/trie/dictionary.txt index 88a1145f..5fb37fc3 100644 --- a/data_structures/trie/dictionary.txt +++ b/data_structures/trie/dictionary.txt @@ -11398,7 +11398,6 @@ ancylostome ancylostomiasis ancyroid and -and/or anda andabata andabatarian @@ -23796,8 +23795,6 @@ azymous b bhoy bs -b/l -b/s ba baa baaed @@ -41542,10 +41539,6 @@ byzants bz c cs -c/d -c/f -c/m -c/o ca ca cacanny @@ -126581,7 +126574,6 @@ hailweed haily haimsucken hain -hain"t haint hainberry hainch @@ -128730,7 +128722,6 @@ hdlc hdqrs hdwe he -he"ll hed hell hes @@ -139010,7 +139001,6 @@ ill im is ive -i/c ia iago iamatology @@ -145956,7 +145946,6 @@ inpours inpush input inputs -input/output inputfile inputs inputted @@ -151770,7 +151759,6 @@ isuretine isuroid isz it -it"ll itd itll its @@ -155260,7 +155248,6 @@ kb kbar kbps kc -kc/s kcal kea keach @@ -156978,7 +156965,6 @@ klva klystron klystrons km -km/sec kmel kmet kmole @@ -158105,7 +158091,6 @@ lenvoy loeil ls ltre -l/w la laager laagered @@ -164745,8 +164730,6 @@ ller lloyds llyn lm -lm/ft -lm/m ln lndg lnr @@ -167403,7 +167386,6 @@ lyttas lyxose m ms -m/s ma maam maad @@ -185958,8 +185940,6 @@ n ngana nimporte ns -n/a -n/f na naa naam @@ -198979,8 +198959,6 @@ oclock oer oertop os -o/c -o/s oad oadal oaf @@ -345815,8 +345793,6 @@ vyingly vyrnwy w ws -w/ -w/o wa wa waac @@ -354932,4 +354908,4 @@ zymurgy zythem zythum zyzzyva -zyzzyvas \ No newline at end of file +zyzzyvas diff --git a/data_structures/trie/trie.c b/data_structures/trie/trie.c index 787ee1f5..78ab9d9d 100644 --- a/data_structures/trie/trie.c +++ b/data_structures/trie/trie.c @@ -3,6 +3,9 @@ /*-----character - 97 used for get the character from the ASCII value-----*/ +// needed for strnlen +#define _POSIX_C_SOURCE 200809L + #include #include #include @@ -11,177 +14,193 @@ #define ALPHABET_SIZE 26 /*--Node in the Trie--*/ -typedef struct TrieNode -{ - struct TrieNode *children[ALPHABET_SIZE]; - char character; - bool isEndOfWord; +struct trie { + struct trie *children[ALPHABET_SIZE]; + bool end_of_word; +}; -} TrieNode; -/*--Create new node--*/ -TrieNode *createTrieNode() +/*--Create new trie node--*/ +int trie_new ( + struct trie ** trie +) { - TrieNode *node; - node = malloc(sizeof(TrieNode)); - node->isEndOfWord = false; - int i = 0; - while (i < ALPHABET_SIZE) - { - node->children[i] = NULL; - i++; + *trie = calloc(1, sizeof(struct trie)); + if (NULL == *trie) { + // memory allocation failed + return -1; } - return node; + return 0; } + /*--Insert new word to Trie--*/ -void insert(TrieNode *root, char *word) +int trie_insert ( + struct trie * trie, + char *word, + unsigned word_len +) { - /*----Addition of the word done by recurcively----*/ + int ret = 0; - // Check wheather word character pointer is NULL - if ((strlen(word) - 1) != 0) - { - char character = *word; - if (root->children[character - 97] == NULL) - { - TrieNode *node = NULL; - node = createTrieNode(); - node->character = character; - root->children[character - 97] = node; + // this is the end of this word; add an end-of-word marker here and we're + // done. + if (0 == word_len) { + trie->end_of_word = true; + return 0; + } + + // if you have some more complex mapping, you could introduce one here. In + // this easy example, we just subtract 'a' (97) from it, meaning that 'a' is 0, + // 'b' is 1, and so on. + const unsigned int index = word[0] - 'a'; + + // this index is outside the alphabet size; indexing this would mean an + // out-of-bound memory access (bad!). If you introduce a separate map + // function for indexing, then you could move the out-of-bounds index in + // there. + if (ALPHABET_SIZE <= index) { + return -1; + } + + // The index does not exist yet, allocate it. + if (NULL == trie->children[index]) { + ret = trie_new(&trie->children[index]); + if (-1 == ret) { + // creating new trie node failed + return -1; } - word++; - insert(root->children[character - 97], word); } - else - { - root->isEndOfWord = true; - } - return; + + // recurse into the child node + return trie_insert( + /* trie = */ trie->children[index], + /* word = */ word + 1, + /* word_len = */ word_len - 1 + ); } + /*--Search a word in the Trie--*/ -TrieNode *search(TrieNode *root, char *word) +int trie_search( + struct trie * trie, + char *word, + unsigned word_len, + struct trie ** result +) { - TrieNode *temp; - while (*word != '\0') - { - char character = *word; - if (root->children[character - 97] != NULL) - { - temp = root->children[character - 97]; - word++; - root = temp; - } - else - { - printf("No possible words!!\n"); - return NULL; - } + // we found a match + if (0 == word_len) { + *result = trie; + return 0; } - return root; -} -/*---Print a word in the array--*/ -void printArray(char chars[], int len) -{ - int i; - for (i = 0; i < len; i++) - { - printf("%c", chars[i]); + // same here as in trie_insert, if you have a separate index mapping, add + // it here. In this example, we just subtract 'a'. + const unsigned int index = word[0] - 'a'; + + // This word contains letters outside the alphabet length; it's invalid. + // Remember to do this to prevent buffer overflows. + if (ALPHABET_SIZE <= index) { + return -1; } - printf("\n"); + + // No match + if (NULL == trie->children[index]) { + return -1; + } + + // traverse the trie + return trie_search( + /* trie = */ trie->children[index], + /* word = */ word + 1, + /* word_len = */ word_len - 1, + /* result = */ result + ); } /*---Return all the related words------*/ -void printPathsRecur(TrieNode *node, char prefix[], int filledLen) +void trie_print ( + struct trie * trie, + char prefix[], + unsigned prefix_len +) { - if (node == NULL) - return; - prefix[filledLen] = node->character; - filledLen++; - - if (node->isEndOfWord) - { - printArray(prefix, filledLen); + // An end-of-word marker means that this is a complete word, print it. + if (true == trie->end_of_word) { + printf("%.*s\n", prefix_len, prefix); } - int i; - for (i = 0; i < ALPHABET_SIZE; i++) - { - printPathsRecur(node->children[i], prefix, filledLen); + // However, there can be longer words with the same prefix; traverse into + // those as well. + for (int i = 0; i < ALPHABET_SIZE; i++) { + + // No words on this character + if (NULL == trie->children[i]) { + continue; + } + + // If you have a separate index mapping, then you'd need the inverse of + // the map here. Since we subtracted 'a' for the index, we can just add + // 'a' to get the inverse map function. + prefix[prefix_len] = i + 'a'; + + // traverse the print into the child + trie_print(trie->children[i], prefix, prefix_len + 1); } } -/*--Travel through the Trie and return words from it--*/ -void traverse(char prefix[], TrieNode *root) -{ - TrieNode *temp = NULL; - temp = search(root, prefix); - int j = 0; - while (prefix[j] != '\0') - { - j++; - } - printPathsRecur(temp, prefix, j - 1); -} /*------Demonstrate purposes uses text file called dictionary -------*/ -#define NUMBER_OF_WORDS (354935) -#define INPUT_WORD_SIZE (100) +int main() { + int ret = 0; + struct trie * root = NULL; + struct trie * trie = NULL; + char word[100] = {0}; -/*----Get input from the user------*/ -char *receiveInput(char *s) -{ - scanf("%99s", s); - return s; -} + // Create a root trie + ret = trie_new(&root); + if (-1 == ret) { + fprintf(stderr, "Could not create trie\n"); + exit(1); + } -int main() -{ - // Read the file dictionary - int word_count = 0; - char *words[NUMBER_OF_WORDS]; + // open the dictionary file FILE *fp = fopen("dictionary.txt", "r"); - - if (fp == 0) - { + if (NULL == fp) { fprintf(stderr, "Error while opening dictionary file"); exit(1); } - words[word_count] = malloc(INPUT_WORD_SIZE); - - while (fgets(words[word_count], INPUT_WORD_SIZE, fp)) - { - word_count++; - words[word_count] = malloc(INPUT_WORD_SIZE); + // insert all the words from the dictionary + while (1 == fscanf(fp, "%100s\n", word)) { + ret = trie_insert(root, word, strnlen(word, 100)); + if (-1 == ret) { + fprintf(stderr, "Could not insert word into trie\n"); + exit(1); + } } - // Push the words in to Trie - TrieNode *root = NULL; - root = createTrieNode(); - int i; - for (i = 0; i < NUMBER_OF_WORDS; i++) - { - insert(root, words[i]); - } - - while (1) - { + while (1) { printf("Enter keyword: "); - char str[100]; - receiveInput(str); + if (1 != scanf("%100s", word)) { + break; + } + printf( "\n==========================================================\n"); printf("\n********************* Possible Words ********************\n"); - // Find the word through the Trie - traverse(str, root); + ret = trie_search(root, word, strnlen(word, 100), &trie); + if (-1 == ret) { + printf("No results\n"); + continue; + } - printf( - "\n==========================================================\n"); + trie_print(trie, word, strnlen(word, 100)); + + printf("\n==========================================================\n"); } }