From e1fcdd2a0487a13d42ae44206b8d71af61b3a610 Mon Sep 17 00:00:00 2001 From: "Sharon \"Cass\" Cassidy" Date: Wed, 12 Apr 2023 08:57:27 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20Add=20McNaughton=E2=80=93Yamada?= =?UTF-8?q?=E2=80=93Thompson=20algorithm=20(#1241)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * updating DIRECTORY.md * Create mcnaughton_yamada_thompson.c * updating DIRECTORY.md * Update mcnaughton_yamada_thompson.c * fix some memory leaks * fix another memory leak * Update mcnaughton_yamada_thompson.c * added more test cases * a few formatting changes * another few SPaG changes * Update misc/mcnaughton_yamada_thompson.c Co-authored-by: David Leal * updating DIRECTORY.md --------- Co-authored-by: github-actions[bot] Co-authored-by: David Leal --- DIRECTORY.md | 2 + misc/mcnaughton_yamada_thompson.c | 721 ++++++++++++++++++++++++++++++ 2 files changed, 723 insertions(+) create mode 100644 misc/mcnaughton_yamada_thompson.c diff --git a/DIRECTORY.md b/DIRECTORY.md index 1e49c052..cad509a7 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -205,6 +205,7 @@ * [Hamming Distance](https://github.com/TheAlgorithms/C/blob/HEAD/misc/hamming_distance.c) * [Lexicographic Permutations](https://github.com/TheAlgorithms/C/blob/HEAD/misc/lexicographic_permutations.c) * [Longest Subsequence](https://github.com/TheAlgorithms/C/blob/HEAD/misc/longest_subsequence.c) + * [Mcnaughton Yamada Thompson](https://github.com/TheAlgorithms/C/blob/HEAD/misc/mcnaughton_yamada_thompson.c) * [Mirror](https://github.com/TheAlgorithms/C/blob/HEAD/misc/mirror.c) * [Pid](https://github.com/TheAlgorithms/C/blob/HEAD/misc/pid.c) * [Poly Add](https://github.com/TheAlgorithms/C/blob/HEAD/misc/poly_add.c) @@ -218,6 +219,7 @@ * [Union Find](https://github.com/TheAlgorithms/C/blob/HEAD/misc/union_find.c) ## Numerical Methods + * [Bisection Method](https://github.com/TheAlgorithms/C/blob/HEAD/numerical_methods/bisection_method.c) * [Durand Kerner Roots](https://github.com/TheAlgorithms/C/blob/HEAD/numerical_methods/durand_kerner_roots.c) * [Gauss Elimination](https://github.com/TheAlgorithms/C/blob/HEAD/numerical_methods/gauss_elimination.c) * [Gauss Seidel Method](https://github.com/TheAlgorithms/C/blob/HEAD/numerical_methods/gauss_seidel_method.c) diff --git a/misc/mcnaughton_yamada_thompson.c b/misc/mcnaughton_yamada_thompson.c new file mode 100644 index 00000000..9f13ae03 --- /dev/null +++ b/misc/mcnaughton_yamada_thompson.c @@ -0,0 +1,721 @@ +/** + * @file + * @brief [McNaughton–Yamada–Thompson algorithm](https://en.wikipedia.org/wiki/Thompson%27s_construction) + * @details + * From Wikipedia: + * In computer science, Thompson's construction algorithm, + * also called the McNaughton–Yamada–Thompson algorithm, + * is a method of transforming a regular expression into + * an equivalent nondeterministic finite automaton (NFA). + * This implementation implements the all three operations + * (implicit concatenation, '|' for union, '*' for Kleene star) + * required by the formal definition of regular expressions. + * @author [Sharon Cassidy](https://github.com/CascadingCascade) + */ + +#include /// for assert() +#include /// for IO operations +#include /// for string operations +#include /// for memory management + +/* Begin declarations, I opted to place various helper / utility functions + * close to their usages and didn't split their declaration / definition */ + +/** + * @brief Definition for a binary abstract syntax tree (AST) node + */ +struct ASTNode { + char content; ///< the content of this node + struct ASTNode* left; ///< left child + struct ASTNode* right; ///< right child +}; + +struct ASTNode* createNode(char content); +void destroyNode(struct ASTNode* node); +char* preProcessing(const char* input); +struct ASTNode* buildAST(const char* input); + +/** + * @brief Definition for a NFA state transition rule + */ +struct transRule { + struct NFAState* target; ///< pointer to the state to transit to + char cond; ///< the input required to activate this transition +}; + +struct transRule* createRule(struct NFAState* state, char c); +void destroyRule(struct transRule* rule); + +/** + * @brief Definition for a NFA state. Each NFAState object is initialized + * to have a capacity of three rules, since there will only be at most two + * outgoing rules and one empty character circular rule in this algorithm + */ +struct NFAState { + int ruleCount; ///< number of transition rules this state have + struct transRule** rules; ///< the transition rules +}; + +struct NFAState* createState(void); +void destroyState(struct NFAState* state); + +/** + * @brief Definition for the NFA itself. + * statePool[0] is defined to be its starting state, + * and statePool[1] is defined to be its accepting state. + * for simplicity's sake all NFAs are initialized to have + * a small fixed capacity, although due to the recursive nature + * of this algorithm this capacity is believed to be sufficient + */ +struct NFA { + int stateCount; ///< the total number of states this NFA have + struct NFAState** statePool; ///< the pool of all available states + int ruleCount; ///< the total number of transition rules in this NFA + struct transRule** rulePool; ///< the pool of all transition rules + int CSCount; ///< the number of currently active states + struct NFAState** currentStates; ///< the pool of all active states + int subCount; ///< the number of sub NFAs + struct NFA** subs; ///< the pool of all sub NFAs + int wrapperFlag; ///< whether this NFA is a concatenation wrapper +}; + +struct NFA* createNFA(void); +void destroyNFA(struct NFA* nfa); +void addState(struct NFA* nfa, struct NFAState* state); +void addRule(struct NFA* nfa, struct transRule* rule, int loc); +void postProcessing(struct NFA* nfa); +void transit(struct NFA* nfa, char input); +int isAccepting(const struct NFA* nfa); + +/* End definitions, begin abstract syntax tree construction */ + +/** + * @brief helper function to determine whether a character should be + * considered a character literal + * @param ch the character to be tested + * @returns `1` if it is a character literal + * @returns `0` otherwise + */ +int isLiteral(const char ch) { + return !(ch == '(' || ch == ')' || ch == '*' || ch == '\n' || ch == '|'); +} + +/** + * @brief performs preprocessing on a regex string, + * making all implicit concatenations explicit + * @param input target regex string + * @returns pointer to the processing result + */ +char* preProcessing(const char* input) { + const size_t len = strlen(input); + if(len == 0) { + char* str = malloc(1); + str[0] = '\0'; + return str; + } + + char* str = malloc(len * 2); + size_t op = 0; + + for (size_t i = 0; i < len - 1; ++i) { + char c = input[i]; + str[op++] = c; + // one character lookahead + char c1 = input[i + 1]; + + if( (isLiteral(c) && isLiteral(c1)) || + (isLiteral(c) && c1 == '(') || + (c == ')' && c1 == '(') || + (c == ')' && isLiteral(c1)) || + (c == '*' && isLiteral(c1)) || + (c == '*' && c1 == '(') + ) { + // '\n' is used to represent concatenation + // in this implementation + str[op++] = '\n'; + } + } + + str[op++] = input[len - 1]; + str[op] = '\0'; + return str; +} + +/** + * @brief utility function to locate the first occurrence + * of a character in a string while respecting parentheses + * @param str target string + * @param key the character to be located + * @returns the index of its first occurrence, `0` if it could not be found + */ +size_t indexOf(const char* str, char key) { + int depth = 0; + + for (size_t i = 0; i < strlen(str); ++i) { + const char c = str[i]; + + if(depth == 0 && c == key) { + return i; + } + if(c == '(') depth++; + if(c == ')') depth--; + } + // Due to the way this function is intended to be used, + // it's safe to assume the character will not appear as + // the string's first character + // thus `0` is used as the `not found` value + return 0; +} + +/** + * @brief utility function to create a subString + * @param str target string + * @param begin starting index, inclusive + * @param end ending index, inclusive + * @returns pointer to the newly created subString + */ +char* subString(const char* str, size_t begin, size_t end) { + char* res = malloc(end - begin + 2); + strncpy(res, str + begin, end - begin + 1); + res[end - begin + 1] = '\0'; + return res; +} + +/** + * @brief recursively constructs a AST from a preprocessed regex string + * @param input regex + * @returns pointer to the resulting tree + */ +struct ASTNode* buildAST(const char* input) { + + struct ASTNode* node = createNode('\0'); + node->left = NULL; + node->right = NULL; + const size_t len = strlen(input); + size_t index; + + // Empty input + if(len == 0) return node; + + // Character literals + if(len == 1) { + node->content = input[0]; + return node; + } + + // Discard parentheses + if(input[0] == '(' && input[len - 1] == ')') { + char* temp = subString(input, 1, len - 2); + destroyNode(node); + node = buildAST(temp); + + free(temp); + return node; + } + + // Union + index = indexOf(input, '|'); + if(index) { + node->content = '|'; + + char* temp1 = subString(input, 0, index - 1); + char* temp2 = subString(input, index + 1, len - 1); + node->left = buildAST(temp1); + node->right = buildAST(temp2); + + free(temp2); + free(temp1); + return node; + } + + // Concatenation + index = indexOf(input, '\n'); + if(index) { + node->content = '\n'; + + char* temp1 = subString(input, 0, index - 1); + char* temp2 = subString(input, index + 1, len - 1); + node->left = buildAST(temp1); + node->right = buildAST(temp2); + + free(temp2); + free(temp1); + return node; + } + + // Kleene star + // Testing with indexOf() is unnecessary here, + // Since all other possibilities have been exhausted + node->content = '*'; + char* temp = subString(input, 0, len - 2); + node->left = buildAST(temp); + node->right = NULL; + + free(temp); + return node; +} + +/* End AST construction, begins the actual algorithm itself */ + +/** + * @brief helper function to recursively redirect transition rule targets + * @param nfa target NFA + * @param src the state to redirect away from + * @param dest the state to redirect to + * @returns void + */ +void redirect(struct NFA* nfa, struct NFAState* src, struct NFAState* dest) { + for (int i = 0; i < nfa->subCount; ++i) { + redirect(nfa->subs[i], src, dest); + } + for (int i = 0; i < nfa->ruleCount; ++i) { + struct transRule* rule = nfa->rulePool[i]; + if (rule->target == src) { + rule->target = dest; + } + } +} + +struct NFA* compileFromAST(struct ASTNode* root) { + + struct NFA* nfa = createNFA(); + + // Empty input + if (root->content == '\0') { + addRule(nfa, createRule(nfa->statePool[1], '\0'), 0); + return nfa; + } + + // Character literals + if (isLiteral(root->content)) { + addRule(nfa, createRule(nfa->statePool[1], root->content), 0); + return nfa; + } + + switch (root->content) { + + case '\n': { + struct NFA* ln = compileFromAST(root->left); + struct NFA* rn = compileFromAST(root->right); + + // Redirects all rules targeting ln's accepting state to + // target rn's starting state + redirect(ln, ln->statePool[1], rn->statePool[0]); + + // Manually creates and initializes a special + // "wrapper" NFA + destroyNFA(nfa); + struct NFA* wrapper = malloc(sizeof(struct NFA)); + wrapper->stateCount = 2; + wrapper->statePool = malloc(sizeof(struct NFAState*) * 2); + wrapper->subCount = 0; + wrapper->subs = malloc(sizeof(struct NFA*) * 2); + wrapper->ruleCount = 0; + wrapper->rulePool = malloc(sizeof(struct transRule*) * 3); + wrapper->CSCount = 0; + wrapper->currentStates = malloc(sizeof(struct NFAState*) * 2); + wrapper->wrapperFlag = 1; + wrapper->subs[wrapper->subCount++] = ln; + wrapper->subs[wrapper->subCount++] = rn; + + // Maps the wrapper NFA's starting and ending states + // to its sub NFAs + wrapper->statePool[0] = ln->statePool[0]; + wrapper->statePool[1] = rn->statePool[1]; + + return wrapper; + } + case '|': { + + struct NFA* ln = compileFromAST(root->left); + struct NFA* rn = compileFromAST(root->right); + nfa->subs[nfa->subCount++] = ln; + nfa->subs[nfa->subCount++] = rn; + + // Adds empty character transition rules + addRule(nfa, createRule(ln->statePool[0], '\0'), 0); + addRule(ln, createRule(nfa->statePool[1], '\0'), 1); + addRule(nfa, createRule(rn->statePool[0], '\0'), 0); + addRule(rn, createRule(nfa->statePool[1], '\0'), 1); + + return nfa; + } + case '*': { + struct NFA* ln = compileFromAST(root->left); + nfa->subs[nfa->subCount++] = ln; + + addRule(ln, createRule(ln->statePool[0], '\0'), 1); + addRule(nfa, createRule(ln->statePool[0], '\0'), 0); + addRule(ln, createRule(nfa->statePool[1], '\0'), 1); + addRule(nfa, createRule(nfa->statePool[1], '\0'), 0); + + return nfa; + } + } + + // Fallback, shouldn't happen in normal operation + destroyNFA(nfa); + return NULL; +} + +/* Ends the algorithm, begins NFA utility functions*/ + +/** + * @brief adds a state to a NFA + * @param nfa target NFA + * @param state the NFA state to be added + * @returns void + */ +void addState(struct NFA* nfa, struct NFAState* state) { + nfa->statePool[nfa->stateCount++] = state; +} + +/** + * @brief adds a transition rule to a NFA + * @param nfa target NFA + * @param rule the rule to be added + * @param loc which state this rule should be added to + * @returns void + */ +void addRule(struct NFA* nfa, struct transRule* rule, int loc) { + nfa->rulePool[nfa->ruleCount++] = rule; + struct NFAState* state = nfa->statePool[loc]; + state->rules[state->ruleCount++] = rule; +} + +/** + * @brief performs postprocessing on a compiled NFA, + * add circular empty character transition rules where + * it's needed for the NFA to function correctly + * @param nfa target NFA + * @returns void + */ +void postProcessing(struct NFA* nfa) { + // Since the sub NFA's states and rules are managed + // through their own pools, recursion is necessary + for (int i = 0; i < nfa->subCount; ++i) { + postProcessing(nfa->subs[i]); + } + + // If a state does not have any empty character accepting rule, + // we add a rule that circles back to itself + // So this state will be preserved when + // empty characters are inputted + for (int i = 0; i < nfa->stateCount; ++i) { + + struct NFAState* pState = nfa->statePool[i]; + int f = 0; + for (int j = 0; j < pState->ruleCount; ++j) { + if(pState->rules[j]->cond == '\0') { + f = 1; + break; + } + } + + if (!f) { + addRule(nfa, createRule(pState, '\0'), i); + } + } +} + +/** + * @brief helper function to determine an element's presence in an array + * @param states target array + * @param len length of the target array + * @param state the element to search for + * @returns `1` if the element is present, `0` otherwise + */ +int contains(struct NFAState** states, int len, struct NFAState* state) { + int f = 0; + for (int i = 0; i < len; ++i) { + if(states[i] == state) { + f = 1; + break; + } + } + return f; +} + +/** + * @brief helper function to manage empty character transitions + * @param target target NFA + * @param states pointer to results storage location + * @param sc pointer to results count storage location + * @returns void + */ +void findEmpty(struct NFAState* target, struct NFAState** states, int *sc) { + for (int i = 0; i < target->ruleCount; ++i) { + const struct transRule *pRule = target->rules[i]; + + if (pRule->cond == '\0' && !contains(states, *sc, pRule->target)) { + states[(*sc)++] = pRule->target; + // the use of `states` and `sc` is necessary + // to sync data across recursion levels + findEmpty(pRule->target, states, sc); + } + } +} + +/** + * @brief moves a NFA forward + * @param nfa target NFA + * @param input the character to be fed into the NFA + * @returns void + */ +void transit(struct NFA* nfa, char input) { + struct NFAState** newStates = malloc(sizeof(struct NFAState*) * 10); + int NSCount = 0; + + if (input == '\0') { + // In case of empty character input, it's possible for + // a state to transit to another state that's more than + // one rule away, we need to take that into account + for (int i = nfa->CSCount - 1; i > -1; --i) { + struct NFAState *pState = nfa->currentStates[i]; + nfa->CSCount--; + + struct NFAState** states = malloc(sizeof(struct NFAState*) * 10); + int sc = 0; + findEmpty(pState, states, &sc); + + for (int j = 0; j < sc; ++j) { + if(!contains(newStates,NSCount, states[j])) { + newStates[NSCount++] = states[j]; + } + } + free(states); + } + } else { + // Iterates through all current states + for (int i = nfa->CSCount - 1; i > -1; --i) { + struct NFAState *pState = nfa->currentStates[i]; + // Gradually empties the current states pool, so + // it can be refilled + nfa->CSCount--; + + // Iterates through rules of this state + for (int j = 0; j < pState->ruleCount; ++j) { + const struct transRule *pRule = pState->rules[j]; + + if(pRule->cond == input) { + if(!contains(newStates, NSCount, pRule->target)) { + newStates[NSCount++] = pRule->target; + } + } + } + } + } + + nfa->CSCount = NSCount; + for (int i = 0; i < NSCount; ++i) { + nfa->currentStates[i] = newStates[i]; + } + free(newStates); +} + +/** + * @brief determines whether the NFA is currently in its accepting state + * @param nfa target NFA + * @returns `1` if the NFA is in its accepting state + * @returns `0` otherwise + */ +int isAccepting(const struct NFA* nfa) { + for (int i = 0; i < nfa->CSCount; ++i) { + if(nfa->currentStates[i] == nfa->statePool[1]) { + return 1; + } + } + return 0; +} + +/* Ends NFA utilities, begins testing function*/ + +/** + * @brief Testing helper function + * @param regex the regular expression to be used + * @param string the string to match against + * @param expected expected results + * @returns void + */ +void testHelper(const char* regex, const char* string, const int expected) { + char* temp = preProcessing(regex); + struct ASTNode* node = buildAST(temp); + + struct NFA* nfa = compileFromAST(node); + postProcessing(nfa); + + // reallocates the outermost NFA's current states pool + // because it will actually be used to store all the states + nfa->currentStates = realloc(nfa->currentStates, sizeof(struct NFAState*) * 100); + // Starts the NFA by adding its starting state to the pool + nfa->currentStates[nfa->CSCount++] = nfa->statePool[0]; + + // feeds empty characters into the NFA before and after + // every normal character + for (size_t i = 0; i < strlen(string); ++i) { + transit(nfa, '\0'); + transit(nfa, string[i]); + } + transit(nfa, '\0'); + + assert(isAccepting(nfa) == expected); + + destroyNFA(nfa); + destroyNode(node); + free(temp); +} + +/** + * @brief Self-test implementations + * @returns void + */ +static void test(void) { + testHelper("(c|a*b)", "c", 1); + testHelper("(c|a*b)", "aab", 1); + testHelper("(c|a*b)", "ca", 0); + testHelper("(c|a*b)*", "caaab", 1); + testHelper("(c|a*b)*", "caba", 0); + testHelper("", "", 1); + testHelper("", "1", 0); + testHelper("(0|(1(01*(00)*0)*1)*)*","11",1); + testHelper("(0|(1(01*(00)*0)*1)*)*","110",1); + testHelper("(0|(1(01*(00)*0)*1)*)*","1100",1); + testHelper("(0|(1(01*(00)*0)*1)*)*","10000",0); + testHelper("(0|(1(01*(00)*0)*1)*)*","00000",1); + + printf("All tests have successfully passed!\n"); +} + +/** + * @brief Main function + * @returns 0 on exit + */ +int main(void) { + test(); // run self-test implementations + return 0; +} + +/* I opted to place these more-or-less boilerplate code and their docs + * at the end of file for better readability */ + +/** + * @brief creates and initializes a AST node + * @param content data to initializes the node with + * @returns pointer to the newly created node + */ +struct ASTNode* createNode(const char content) { + struct ASTNode* node = malloc(sizeof(struct ASTNode)); + node->content = content; + node->left = NULL; + node->right = NULL; + return node; +} + +/** + * @brief recursively destroys a AST + * @param node the root node of the tree to be deleted + * @returns void + */ +void destroyNode(struct ASTNode* node) { + if(node->left != NULL) { + destroyNode(node->left); + } + + if(node->right != NULL) { + destroyNode(node->right); + } + + free(node); +} + +/** + * @brief creates and initializes a transition rule + * @param state transition target + * @param c transition condition + * @returns pointer to the newly created rule + */ +struct transRule* createRule(struct NFAState* state, char c) { + struct transRule* rule = malloc(sizeof(struct transRule)); + rule->target = state; + rule->cond = c; + return rule; +} + +/** + * @brief destroys a transition rule object + * @param rule pointer to the object to be deleted + * @returns void + */ +void destroyRule(struct transRule* rule) { + free(rule); +} + +/** + * @brief creates and initializes a NFA state + * @returns pointer to the newly created NFA state + */ +struct NFAState* createState(void) { + struct NFAState* state = malloc(sizeof(struct NFAState)); + state->ruleCount = 0; + state->rules = malloc(sizeof(struct transRule*) * 3); + return state; +} + +/** + * @brief destroys a NFA state + * @param state pointer to the object to be deleted + * @returns void + */ +void destroyState(struct NFAState* state) { + free(state->rules); + free(state); +} + +/** + * @brief creates and initializes a NFA + * @returns pointer to the newly created NFA + */ +struct NFA* createNFA(void) { + struct NFA* nfa = malloc(sizeof(struct NFA)); + + nfa->stateCount = 0; + nfa->statePool = malloc(sizeof(struct NFAState*) * 5); + nfa->ruleCount = 0; + nfa->rulePool = malloc(sizeof(struct transRule*) * 10); + nfa->CSCount = 0; + nfa->currentStates = malloc(sizeof(struct NFAState*) * 5); + nfa->subCount = 0; + nfa->subs = malloc(sizeof(struct NFA*) * 5); + nfa->wrapperFlag = 0; + + addState(nfa, createState()); + addState(nfa, createState()); + return nfa; +} + +/** + * @brief recursively destroys a NFA + * @param nfa pointer to the object to be deleted + * @returns void + */ +void destroyNFA(struct NFA* nfa) { + for (int i = 0; i < nfa->subCount; ++i) { + destroyNFA(nfa->subs[i]); + } + + // In case of a wrapper NFA, do not free its states + // because it doesn't really have any states of its own + if (!nfa->wrapperFlag) { + for (int i = 0; i < nfa->stateCount; ++i) { + destroyState(nfa->statePool[i]); + } + } + for (int i = 0; i < nfa->ruleCount; ++i) { + destroyRule(nfa->rulePool[i]); + } + free(nfa->statePool); + free(nfa->currentStates); + free(nfa->rulePool); + free(nfa->subs); + free(nfa); +}