feat: Rewrite the trie example (#927)

The trie example had some issues:

* It did not follow the code convention in CONTRIBUTING.md
* The createTrieNode used an inefficient zeroing method (looping over
  the entries) which also does not zero out holes in the structure. An
  alternative would be to use "*node = (TrieNode){0}", but calloc
  does all that anyway.
* It used an inefficient and clumsy printArray method
* It used strlen inside the algorithm; this new method could get rid of
  any strlen/strnlen usage (inserts/searches could be sanitized by
  snprintf)
* This version can allow for a custom mapping function, e.g. if NUL
  ('\0') is a valid separator (say that you want a trie for certain
  binary packages)
* The previous version actually contained out-of-bounds array indexing;
  there were no checks for out-of-bound indexing and words in the word
  list did contain out of bounds words. It's a surprise it was working
  so well.
* This version just returns 'int' to allow for error checks (instead of
  a printf inside the algorithm), and uses double pointers for return
  values (good practice)
* The usage example contained unnecessary mallocs; those were switched
  out for scanf. The example is just an example after all; in real
  applications you'd have better input sanitization.
This commit is contained in:
Daniel Beecham 2022-03-20 02:04:18 +01:00 committed by GitHub
parent d017c3ef77
commit 2314a19586
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 146 additions and 151 deletions

View File

@ -11398,7 +11398,6 @@ ancylostome
ancylostomiasis ancylostomiasis
ancyroid ancyroid
and and
and/or
anda anda
andabata andabata
andabatarian andabatarian
@ -23796,8 +23795,6 @@ azymous
b b
bhoy bhoy
bs bs
b/l
b/s
ba ba
baa baa
baaed baaed
@ -41542,10 +41539,6 @@ byzants
bz bz
c c
cs cs
c/d
c/f
c/m
c/o
ca ca
ca ca
cacanny cacanny
@ -126581,7 +126574,6 @@ hailweed
haily haily
haimsucken haimsucken
hain hain
hain"t
haint haint
hainberry hainberry
hainch hainch
@ -128730,7 +128722,6 @@ hdlc
hdqrs hdqrs
hdwe hdwe
he he
he"ll
hed hed
hell hell
hes hes
@ -139010,7 +139001,6 @@ ill
im im
is is
ive ive
i/c
ia ia
iago iago
iamatology iamatology
@ -145956,7 +145946,6 @@ inpours
inpush inpush
input input
inputs inputs
input/output
inputfile inputfile
inputs inputs
inputted inputted
@ -151770,7 +151759,6 @@ isuretine
isuroid isuroid
isz isz
it it
it"ll
itd itd
itll itll
its its
@ -155260,7 +155248,6 @@ kb
kbar kbar
kbps kbps
kc kc
kc/s
kcal kcal
kea kea
keach keach
@ -156978,7 +156965,6 @@ klva
klystron klystron
klystrons klystrons
km km
km/sec
kmel kmel
kmet kmet
kmole kmole
@ -158105,7 +158091,6 @@ lenvoy
loeil loeil
ls ls
ltre ltre
l/w
la la
laager laager
laagered laagered
@ -164745,8 +164730,6 @@ ller
lloyds lloyds
llyn llyn
lm lm
lm/ft
lm/m
ln ln
lndg lndg
lnr lnr
@ -167403,7 +167386,6 @@ lyttas
lyxose lyxose
m m
ms ms
m/s
ma ma
maam maam
maad maad
@ -185958,8 +185940,6 @@ n
ngana ngana
nimporte nimporte
ns ns
n/a
n/f
na na
naa naa
naam naam
@ -198979,8 +198959,6 @@ oclock
oer oer
oertop oertop
os os
o/c
o/s
oad oad
oadal oadal
oaf oaf
@ -345815,8 +345793,6 @@ vyingly
vyrnwy vyrnwy
w w
ws ws
w/
w/o
wa wa
wa wa
waac waac

View File

@ -3,6 +3,9 @@
/*-----character - 97 used for get the character from the ASCII value-----*/ /*-----character - 97 used for get the character from the ASCII value-----*/
// needed for strnlen
#define _POSIX_C_SOURCE 200809L
#include <stdbool.h> #include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -11,177 +14,193 @@
#define ALPHABET_SIZE 26 #define ALPHABET_SIZE 26
/*--Node in the Trie--*/ /*--Node in the Trie--*/
typedef struct TrieNode struct trie {
{ struct trie *children[ALPHABET_SIZE];
struct TrieNode *children[ALPHABET_SIZE]; bool end_of_word;
char character; };
bool isEndOfWord;
} TrieNode;
/*--Create new node--*/ /*--Create new trie node--*/
TrieNode *createTrieNode() int trie_new (
struct trie ** trie
)
{ {
TrieNode *node; *trie = calloc(1, sizeof(struct trie));
node = malloc(sizeof(TrieNode)); if (NULL == *trie) {
node->isEndOfWord = false; // memory allocation failed
int i = 0; return -1;
while (i < ALPHABET_SIZE)
{
node->children[i] = NULL;
i++;
} }
return node; return 0;
} }
/*--Insert new word to Trie--*/ /*--Insert new word to Trie--*/
void insert(TrieNode *root, char *word) int trie_insert (
struct trie * trie,
char *word,
unsigned word_len
)
{ {
/*----Addition of the word done by recurcively----*/ int ret = 0;
// Check wheather word character pointer is NULL // this is the end of this word; add an end-of-word marker here and we're
if ((strlen(word) - 1) != 0) // done.
{ if (0 == word_len) {
char character = *word; trie->end_of_word = true;
if (root->children[character - 97] == NULL) return 0;
{ }
TrieNode *node = NULL;
node = createTrieNode(); // if you have some more complex mapping, you could introduce one here. In
node->character = character; // this easy example, we just subtract 'a' (97) from it, meaning that 'a' is 0,
root->children[character - 97] = node; // 'b' is 1, and so on.
const unsigned int index = word[0] - 'a';
// this index is outside the alphabet size; indexing this would mean an
// out-of-bound memory access (bad!). If you introduce a separate map
// function for indexing, then you could move the out-of-bounds index in
// there.
if (ALPHABET_SIZE <= index) {
return -1;
}
// The index does not exist yet, allocate it.
if (NULL == trie->children[index]) {
ret = trie_new(&trie->children[index]);
if (-1 == ret) {
// creating new trie node failed
return -1;
} }
word++;
insert(root->children[character - 97], word);
} }
else
{ // recurse into the child node
root->isEndOfWord = true; return trie_insert(
} /* trie = */ trie->children[index],
return; /* word = */ word + 1,
/* word_len = */ word_len - 1
);
} }
/*--Search a word in the Trie--*/ /*--Search a word in the Trie--*/
TrieNode *search(TrieNode *root, char *word) int trie_search(
struct trie * trie,
char *word,
unsigned word_len,
struct trie ** result
)
{ {
TrieNode *temp; // we found a match
while (*word != '\0') if (0 == word_len) {
{ *result = trie;
char character = *word; return 0;
if (root->children[character - 97] != NULL)
{
temp = root->children[character - 97];
word++;
root = temp;
}
else
{
printf("No possible words!!\n");
return NULL;
}
} }
return root;
}
/*---Print a word in the array--*/ // same here as in trie_insert, if you have a separate index mapping, add
void printArray(char chars[], int len) // it here. In this example, we just subtract 'a'.
{ const unsigned int index = word[0] - 'a';
int i;
for (i = 0; i < len; i++) // This word contains letters outside the alphabet length; it's invalid.
{ // Remember to do this to prevent buffer overflows.
printf("%c", chars[i]); if (ALPHABET_SIZE <= index) {
return -1;
} }
printf("\n");
// No match
if (NULL == trie->children[index]) {
return -1;
}
// traverse the trie
return trie_search(
/* trie = */ trie->children[index],
/* word = */ word + 1,
/* word_len = */ word_len - 1,
/* result = */ result
);
} }
/*---Return all the related words------*/ /*---Return all the related words------*/
void printPathsRecur(TrieNode *node, char prefix[], int filledLen) void trie_print (
struct trie * trie,
char prefix[],
unsigned prefix_len
)
{ {
if (node == NULL)
return;
prefix[filledLen] = node->character; // An end-of-word marker means that this is a complete word, print it.
filledLen++; if (true == trie->end_of_word) {
printf("%.*s\n", prefix_len, prefix);
if (node->isEndOfWord)
{
printArray(prefix, filledLen);
} }
int i; // However, there can be longer words with the same prefix; traverse into
for (i = 0; i < ALPHABET_SIZE; i++) // those as well.
{ for (int i = 0; i < ALPHABET_SIZE; i++) {
printPathsRecur(node->children[i], prefix, filledLen);
// No words on this character
if (NULL == trie->children[i]) {
continue;
}
// If you have a separate index mapping, then you'd need the inverse of
// the map here. Since we subtracted 'a' for the index, we can just add
// 'a' to get the inverse map function.
prefix[prefix_len] = i + 'a';
// traverse the print into the child
trie_print(trie->children[i], prefix, prefix_len + 1);
} }
} }
/*--Travel through the Trie and return words from it--*/
void traverse(char prefix[], TrieNode *root)
{
TrieNode *temp = NULL;
temp = search(root, prefix);
int j = 0;
while (prefix[j] != '\0')
{
j++;
}
printPathsRecur(temp, prefix, j - 1);
}
/*------Demonstrate purposes uses text file called dictionary -------*/ /*------Demonstrate purposes uses text file called dictionary -------*/
#define NUMBER_OF_WORDS (354935) int main() {
#define INPUT_WORD_SIZE (100) int ret = 0;
struct trie * root = NULL;
struct trie * trie = NULL;
char word[100] = {0};
/*----Get input from the user------*/ // Create a root trie
char *receiveInput(char *s) ret = trie_new(&root);
{ if (-1 == ret) {
scanf("%99s", s); fprintf(stderr, "Could not create trie\n");
return s; exit(1);
} }
int main() // open the dictionary file
{
// Read the file dictionary
int word_count = 0;
char *words[NUMBER_OF_WORDS];
FILE *fp = fopen("dictionary.txt", "r"); FILE *fp = fopen("dictionary.txt", "r");
if (NULL == fp) {
if (fp == 0)
{
fprintf(stderr, "Error while opening dictionary file"); fprintf(stderr, "Error while opening dictionary file");
exit(1); exit(1);
} }
words[word_count] = malloc(INPUT_WORD_SIZE); // insert all the words from the dictionary
while (1 == fscanf(fp, "%100s\n", word)) {
while (fgets(words[word_count], INPUT_WORD_SIZE, fp)) ret = trie_insert(root, word, strnlen(word, 100));
{ if (-1 == ret) {
word_count++; fprintf(stderr, "Could not insert word into trie\n");
words[word_count] = malloc(INPUT_WORD_SIZE); exit(1);
}
} }
// Push the words in to Trie while (1) {
TrieNode *root = NULL;
root = createTrieNode();
int i;
for (i = 0; i < NUMBER_OF_WORDS; i++)
{
insert(root, words[i]);
}
while (1)
{
printf("Enter keyword: "); printf("Enter keyword: ");
char str[100]; if (1 != scanf("%100s", word)) {
receiveInput(str); break;
}
printf( printf(
"\n==========================================================\n"); "\n==========================================================\n");
printf("\n********************* Possible Words ********************\n"); printf("\n********************* Possible Words ********************\n");
// Find the word through the Trie ret = trie_search(root, word, strnlen(word, 100), &trie);
traverse(str, root); if (-1 == ret) {
printf("No results\n");
continue;
}
printf( trie_print(trie, word, strnlen(word, 100));
"\n==========================================================\n");
printf("\n==========================================================\n");
} }
} }