/** * @file * @brief [McNaughton–Yamada–Thompson algorithm](https://en.wikipedia.org/wiki/Thompson%27s_construction) * @details * From Wikipedia: * In computer science, Thompson's construction algorithm, * also called the McNaughton–Yamada–Thompson algorithm, * is a method of transforming a regular expression into * an equivalent nondeterministic finite automaton (NFA). * This implementation implements the all three operations * (implicit concatenation, '|' for union, '*' for Kleene star) * required by the formal definition of regular expressions. * @author [Sharon Cassidy](https://github.com/CascadingCascade) */ #include /// for assert() #include /// for IO operations #include /// for string operations #include /// for memory management /* Begin declarations, I opted to place various helper / utility functions * close to their usages and didn't split their declaration / definition */ /** * @brief Definition for a binary abstract syntax tree (AST) node */ struct ASTNode { char content; ///< the content of this node struct ASTNode* left; ///< left child struct ASTNode* right; ///< right child }; struct ASTNode* createNode(char content); void destroyNode(struct ASTNode* node); char* preProcessing(const char* input); struct ASTNode* buildAST(const char* input); /** * @brief Definition for a NFA state transition rule */ struct transRule { struct NFAState* target; ///< pointer to the state to transit to char cond; ///< the input required to activate this transition }; struct transRule* createRule(struct NFAState* state, char c); void destroyRule(struct transRule* rule); /** * @brief Definition for a NFA state. Each NFAState object is initialized * to have a capacity of three rules, since there will only be at most two * outgoing rules and one empty character circular rule in this algorithm */ struct NFAState { int ruleCount; ///< number of transition rules this state have struct transRule** rules; ///< the transition rules }; struct NFAState* createState(void); void destroyState(struct NFAState* state); /** * @brief Definition for the NFA itself. * statePool[0] is defined to be its starting state, * and statePool[1] is defined to be its accepting state. * for simplicity's sake all NFAs are initialized to have * a small fixed capacity, although due to the recursive nature * of this algorithm this capacity is believed to be sufficient */ struct NFA { int stateCount; ///< the total number of states this NFA have struct NFAState** statePool; ///< the pool of all available states int ruleCount; ///< the total number of transition rules in this NFA struct transRule** rulePool; ///< the pool of all transition rules int CSCount; ///< the number of currently active states struct NFAState** currentStates; ///< the pool of all active states int subCount; ///< the number of sub NFAs struct NFA** subs; ///< the pool of all sub NFAs int wrapperFlag; ///< whether this NFA is a concatenation wrapper }; struct NFA* createNFA(void); void destroyNFA(struct NFA* nfa); void addState(struct NFA* nfa, struct NFAState* state); void addRule(struct NFA* nfa, struct transRule* rule, int loc); void postProcessing(struct NFA* nfa); void transit(struct NFA* nfa, char input); int isAccepting(const struct NFA* nfa); /* End definitions, begin abstract syntax tree construction */ /** * @brief helper function to determine whether a character should be * considered a character literal * @param ch the character to be tested * @returns `1` if it is a character literal * @returns `0` otherwise */ int isLiteral(const char ch) { return !(ch == '(' || ch == ')' || ch == '*' || ch == '\n' || ch == '|'); } /** * @brief performs preprocessing on a regex string, * making all implicit concatenations explicit * @param input target regex string * @returns pointer to the processing result */ char* preProcessing(const char* input) { const size_t len = strlen(input); if(len == 0) { char* str = malloc(1); str[0] = '\0'; return str; } char* str = malloc(len * 2); size_t op = 0; for (size_t i = 0; i < len - 1; ++i) { char c = input[i]; str[op++] = c; // one character lookahead char c1 = input[i + 1]; if( (isLiteral(c) && isLiteral(c1)) || (isLiteral(c) && c1 == '(') || (c == ')' && c1 == '(') || (c == ')' && isLiteral(c1)) || (c == '*' && isLiteral(c1)) || (c == '*' && c1 == '(') ) { // '\n' is used to represent concatenation // in this implementation str[op++] = '\n'; } } str[op++] = input[len - 1]; str[op] = '\0'; return str; } /** * @brief utility function to locate the first occurrence * of a character in a string while respecting parentheses * @param str target string * @param key the character to be located * @returns the index of its first occurrence, `0` if it could not be found */ size_t indexOf(const char* str, char key) { int depth = 0; for (size_t i = 0; i < strlen(str); ++i) { const char c = str[i]; if(depth == 0 && c == key) { return i; } if(c == '(') depth++; if(c == ')') depth--; } // Due to the way this function is intended to be used, // it's safe to assume the character will not appear as // the string's first character // thus `0` is used as the `not found` value return 0; } /** * @brief utility function to create a subString * @param str target string * @param begin starting index, inclusive * @param end ending index, inclusive * @returns pointer to the newly created subString */ char* subString(const char* str, size_t begin, size_t end) { char* res = malloc(end - begin + 2); strncpy(res, str + begin, end - begin + 1); res[end - begin + 1] = '\0'; return res; } /** * @brief recursively constructs a AST from a preprocessed regex string * @param input regex * @returns pointer to the resulting tree */ struct ASTNode* buildAST(const char* input) { struct ASTNode* node = createNode('\0'); node->left = NULL; node->right = NULL; const size_t len = strlen(input); size_t index; // Empty input if(len == 0) return node; // Character literals if(len == 1) { node->content = input[0]; return node; } // Discard parentheses if(input[0] == '(' && input[len - 1] == ')') { char* temp = subString(input, 1, len - 2); destroyNode(node); node = buildAST(temp); free(temp); return node; } // Union index = indexOf(input, '|'); if(index) { node->content = '|'; char* temp1 = subString(input, 0, index - 1); char* temp2 = subString(input, index + 1, len - 1); node->left = buildAST(temp1); node->right = buildAST(temp2); free(temp2); free(temp1); return node; } // Concatenation index = indexOf(input, '\n'); if(index) { node->content = '\n'; char* temp1 = subString(input, 0, index - 1); char* temp2 = subString(input, index + 1, len - 1); node->left = buildAST(temp1); node->right = buildAST(temp2); free(temp2); free(temp1); return node; } // Kleene star // Testing with indexOf() is unnecessary here, // Since all other possibilities have been exhausted node->content = '*'; char* temp = subString(input, 0, len - 2); node->left = buildAST(temp); node->right = NULL; free(temp); return node; } /* End AST construction, begins the actual algorithm itself */ /** * @brief helper function to recursively redirect transition rule targets * @param nfa target NFA * @param src the state to redirect away from * @param dest the state to redirect to * @returns void */ void redirect(struct NFA* nfa, struct NFAState* src, struct NFAState* dest) { for (int i = 0; i < nfa->subCount; ++i) { redirect(nfa->subs[i], src, dest); } for (int i = 0; i < nfa->ruleCount; ++i) { struct transRule* rule = nfa->rulePool[i]; if (rule->target == src) { rule->target = dest; } } } struct NFA* compileFromAST(struct ASTNode* root) { struct NFA* nfa = createNFA(); // Empty input if (root->content == '\0') { addRule(nfa, createRule(nfa->statePool[1], '\0'), 0); return nfa; } // Character literals if (isLiteral(root->content)) { addRule(nfa, createRule(nfa->statePool[1], root->content), 0); return nfa; } switch (root->content) { case '\n': { struct NFA* ln = compileFromAST(root->left); struct NFA* rn = compileFromAST(root->right); // Redirects all rules targeting ln's accepting state to // target rn's starting state redirect(ln, ln->statePool[1], rn->statePool[0]); // Manually creates and initializes a special // "wrapper" NFA destroyNFA(nfa); struct NFA* wrapper = malloc(sizeof(struct NFA)); wrapper->stateCount = 2; wrapper->statePool = malloc(sizeof(struct NFAState*) * 2); wrapper->subCount = 0; wrapper->subs = malloc(sizeof(struct NFA*) * 2); wrapper->ruleCount = 0; wrapper->rulePool = malloc(sizeof(struct transRule*) * 3); wrapper->CSCount = 0; wrapper->currentStates = malloc(sizeof(struct NFAState*) * 2); wrapper->wrapperFlag = 1; wrapper->subs[wrapper->subCount++] = ln; wrapper->subs[wrapper->subCount++] = rn; // Maps the wrapper NFA's starting and ending states // to its sub NFAs wrapper->statePool[0] = ln->statePool[0]; wrapper->statePool[1] = rn->statePool[1]; return wrapper; } case '|': { struct NFA* ln = compileFromAST(root->left); struct NFA* rn = compileFromAST(root->right); nfa->subs[nfa->subCount++] = ln; nfa->subs[nfa->subCount++] = rn; // Adds empty character transition rules addRule(nfa, createRule(ln->statePool[0], '\0'), 0); addRule(ln, createRule(nfa->statePool[1], '\0'), 1); addRule(nfa, createRule(rn->statePool[0], '\0'), 0); addRule(rn, createRule(nfa->statePool[1], '\0'), 1); return nfa; } case '*': { struct NFA* ln = compileFromAST(root->left); nfa->subs[nfa->subCount++] = ln; addRule(ln, createRule(ln->statePool[0], '\0'), 1); addRule(nfa, createRule(ln->statePool[0], '\0'), 0); addRule(ln, createRule(nfa->statePool[1], '\0'), 1); addRule(nfa, createRule(nfa->statePool[1], '\0'), 0); return nfa; } } // Fallback, shouldn't happen in normal operation destroyNFA(nfa); return NULL; } /* Ends the algorithm, begins NFA utility functions*/ /** * @brief adds a state to a NFA * @param nfa target NFA * @param state the NFA state to be added * @returns void */ void addState(struct NFA* nfa, struct NFAState* state) { nfa->statePool[nfa->stateCount++] = state; } /** * @brief adds a transition rule to a NFA * @param nfa target NFA * @param rule the rule to be added * @param loc which state this rule should be added to * @returns void */ void addRule(struct NFA* nfa, struct transRule* rule, int loc) { nfa->rulePool[nfa->ruleCount++] = rule; struct NFAState* state = nfa->statePool[loc]; state->rules[state->ruleCount++] = rule; } /** * @brief performs postprocessing on a compiled NFA, * add circular empty character transition rules where * it's needed for the NFA to function correctly * @param nfa target NFA * @returns void */ void postProcessing(struct NFA* nfa) { // Since the sub NFA's states and rules are managed // through their own pools, recursion is necessary for (int i = 0; i < nfa->subCount; ++i) { postProcessing(nfa->subs[i]); } // If a state does not have any empty character accepting rule, // we add a rule that circles back to itself // So this state will be preserved when // empty characters are inputted for (int i = 0; i < nfa->stateCount; ++i) { struct NFAState* pState = nfa->statePool[i]; int f = 0; for (int j = 0; j < pState->ruleCount; ++j) { if(pState->rules[j]->cond == '\0') { f = 1; break; } } if (!f) { addRule(nfa, createRule(pState, '\0'), i); } } } /** * @brief helper function to determine an element's presence in an array * @param states target array * @param len length of the target array * @param state the element to search for * @returns `1` if the element is present, `0` otherwise */ int contains(struct NFAState** states, int len, struct NFAState* state) { int f = 0; for (int i = 0; i < len; ++i) { if(states[i] == state) { f = 1; break; } } return f; } /** * @brief helper function to manage empty character transitions * @param target target NFA * @param states pointer to results storage location * @param sc pointer to results count storage location * @returns void */ void findEmpty(struct NFAState* target, struct NFAState** states, int *sc) { for (int i = 0; i < target->ruleCount; ++i) { const struct transRule *pRule = target->rules[i]; if (pRule->cond == '\0' && !contains(states, *sc, pRule->target)) { states[(*sc)++] = pRule->target; // the use of `states` and `sc` is necessary // to sync data across recursion levels findEmpty(pRule->target, states, sc); } } } /** * @brief moves a NFA forward * @param nfa target NFA * @param input the character to be fed into the NFA * @returns void */ void transit(struct NFA* nfa, char input) { struct NFAState** newStates = malloc(sizeof(struct NFAState*) * 10); int NSCount = 0; if (input == '\0') { // In case of empty character input, it's possible for // a state to transit to another state that's more than // one rule away, we need to take that into account for (int i = nfa->CSCount - 1; i > -1; --i) { struct NFAState *pState = nfa->currentStates[i]; nfa->CSCount--; struct NFAState** states = malloc(sizeof(struct NFAState*) * 10); int sc = 0; findEmpty(pState, states, &sc); for (int j = 0; j < sc; ++j) { if(!contains(newStates,NSCount, states[j])) { newStates[NSCount++] = states[j]; } } free(states); } } else { // Iterates through all current states for (int i = nfa->CSCount - 1; i > -1; --i) { struct NFAState *pState = nfa->currentStates[i]; // Gradually empties the current states pool, so // it can be refilled nfa->CSCount--; // Iterates through rules of this state for (int j = 0; j < pState->ruleCount; ++j) { const struct transRule *pRule = pState->rules[j]; if(pRule->cond == input) { if(!contains(newStates, NSCount, pRule->target)) { newStates[NSCount++] = pRule->target; } } } } } nfa->CSCount = NSCount; for (int i = 0; i < NSCount; ++i) { nfa->currentStates[i] = newStates[i]; } free(newStates); } /** * @brief determines whether the NFA is currently in its accepting state * @param nfa target NFA * @returns `1` if the NFA is in its accepting state * @returns `0` otherwise */ int isAccepting(const struct NFA* nfa) { for (int i = 0; i < nfa->CSCount; ++i) { if(nfa->currentStates[i] == nfa->statePool[1]) { return 1; } } return 0; } /* Ends NFA utilities, begins testing function*/ /** * @brief Testing helper function * @param regex the regular expression to be used * @param string the string to match against * @param expected expected results * @returns void */ void testHelper(const char* regex, const char* string, const int expected) { char* temp = preProcessing(regex); struct ASTNode* node = buildAST(temp); struct NFA* nfa = compileFromAST(node); postProcessing(nfa); // reallocates the outermost NFA's current states pool // because it will actually be used to store all the states nfa->currentStates = realloc(nfa->currentStates, sizeof(struct NFAState*) * 100); // Starts the NFA by adding its starting state to the pool nfa->currentStates[nfa->CSCount++] = nfa->statePool[0]; // feeds empty characters into the NFA before and after // every normal character for (size_t i = 0; i < strlen(string); ++i) { transit(nfa, '\0'); transit(nfa, string[i]); } transit(nfa, '\0'); assert(isAccepting(nfa) == expected); destroyNFA(nfa); destroyNode(node); free(temp); } /** * @brief Self-test implementations * @returns void */ static void test(void) { testHelper("(c|a*b)", "c", 1); testHelper("(c|a*b)", "aab", 1); testHelper("(c|a*b)", "ca", 0); testHelper("(c|a*b)*", "caaab", 1); testHelper("(c|a*b)*", "caba", 0); testHelper("", "", 1); testHelper("", "1", 0); testHelper("(0|(1(01*(00)*0)*1)*)*","11",1); testHelper("(0|(1(01*(00)*0)*1)*)*","110",1); testHelper("(0|(1(01*(00)*0)*1)*)*","1100",1); testHelper("(0|(1(01*(00)*0)*1)*)*","10000",0); testHelper("(0|(1(01*(00)*0)*1)*)*","00000",1); printf("All tests have successfully passed!\n"); } /** * @brief Main function * @returns 0 on exit */ int main(void) { test(); // run self-test implementations return 0; } /* I opted to place these more-or-less boilerplate code and their docs * at the end of file for better readability */ /** * @brief creates and initializes a AST node * @param content data to initializes the node with * @returns pointer to the newly created node */ struct ASTNode* createNode(const char content) { struct ASTNode* node = malloc(sizeof(struct ASTNode)); node->content = content; node->left = NULL; node->right = NULL; return node; } /** * @brief recursively destroys a AST * @param node the root node of the tree to be deleted * @returns void */ void destroyNode(struct ASTNode* node) { if(node->left != NULL) { destroyNode(node->left); } if(node->right != NULL) { destroyNode(node->right); } free(node); } /** * @brief creates and initializes a transition rule * @param state transition target * @param c transition condition * @returns pointer to the newly created rule */ struct transRule* createRule(struct NFAState* state, char c) { struct transRule* rule = malloc(sizeof(struct transRule)); rule->target = state; rule->cond = c; return rule; } /** * @brief destroys a transition rule object * @param rule pointer to the object to be deleted * @returns void */ void destroyRule(struct transRule* rule) { free(rule); } /** * @brief creates and initializes a NFA state * @returns pointer to the newly created NFA state */ struct NFAState* createState(void) { struct NFAState* state = malloc(sizeof(struct NFAState)); state->ruleCount = 0; state->rules = malloc(sizeof(struct transRule*) * 3); return state; } /** * @brief destroys a NFA state * @param state pointer to the object to be deleted * @returns void */ void destroyState(struct NFAState* state) { free(state->rules); free(state); } /** * @brief creates and initializes a NFA * @returns pointer to the newly created NFA */ struct NFA* createNFA(void) { struct NFA* nfa = malloc(sizeof(struct NFA)); nfa->stateCount = 0; nfa->statePool = malloc(sizeof(struct NFAState*) * 5); nfa->ruleCount = 0; nfa->rulePool = malloc(sizeof(struct transRule*) * 10); nfa->CSCount = 0; nfa->currentStates = malloc(sizeof(struct NFAState*) * 5); nfa->subCount = 0; nfa->subs = malloc(sizeof(struct NFA*) * 5); nfa->wrapperFlag = 0; addState(nfa, createState()); addState(nfa, createState()); return nfa; } /** * @brief recursively destroys a NFA * @param nfa pointer to the object to be deleted * @returns void */ void destroyNFA(struct NFA* nfa) { for (int i = 0; i < nfa->subCount; ++i) { destroyNFA(nfa->subs[i]); } // In case of a wrapper NFA, do not free its states // because it doesn't really have any states of its own if (!nfa->wrapperFlag) { for (int i = 0; i < nfa->stateCount; ++i) { destroyState(nfa->statePool[i]); } } for (int i = 0; i < nfa->ruleCount; ++i) { destroyRule(nfa->rulePool[i]); } free(nfa->statePool); free(nfa->currentStates); free(nfa->rulePool); free(nfa->subs); free(nfa); }