diff --git a/apps/grep.c b/apps/grep.c
new file mode 100644
index 00000000..d7e056f1
--- /dev/null
+++ b/apps/grep.c
@@ -0,0 +1,454 @@
+/**
+ * @brief grep - mediocre grep
+ *
+ * Based on the regex search matcher in bim.
+ *
+ * @copyright
+ * This file is part of ToaruOS and is released under the terms
+ * of the NCSA / University of Illinois License - see LICENSE.md
+ * Copyright (C) 2022 K. Lange
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#define LINE_SIZE 4096
+
+static int invert = 0;
+static int ignorecase = 0;
+static int quiet = 0;
+static int only_matching = 0;
+static int counts = 0;
+
+/**
+ * A single pattern atom: the predicate to call plus the data it reads.
+ * matchChar is read by match_char; matchSquares is read by match_squares
+ * and delimits the inside of a [...] expression (end is the closing ']').
+ */
+struct MatchQualifier {
+	int (*matchFunc)(struct MatchQualifier*,char,int);
+	union {
+		char matchChar;
+		struct {
+			char * start;
+			char * end;
+		} matchSquares;
+	};
+};
+
+/**
+ * Lowercase a plain char. tolower() is only defined for EOF and values
+ * representable as unsigned char, so the argument is converted first.
+ */
+static int fold_case(char c) {
+	return tolower((unsigned char)c);
+}
+
+/**
+ * Match one literal character.
+ * Mode 0 compares exactly, mode 1 ignores case, anything else never matches.
+ */
+int match_char(struct MatchQualifier * self, char b, int mode) {
+	if (mode == 0) {
+		return self->matchChar == b;
+	} else if (mode == 1) {
+		return fold_case(self->matchChar) == fold_case(b);
+	}
+	return 0;
+}
+
+/**
+ * Match c against the bracket expression described by self->matchSquares.
+ * A leading '^' negates the set, "a-z" is a range, and a backslash escapes
+ * a backslash or ']' or stands for a tab when followed by 't'. A non-zero
+ * mode folds case. A '-' directly before the closing bracket has no right
+ * hand side and makes the whole expression fail to match.
+ */
+int match_squares(struct MatchQualifier * self, char c, int mode) {
+	char * start = self->matchSquares.start;
+	char * end = self->matchSquares.end;
+	char * t = start;
+	int good = 1;
+	if (*t == '^') { t++; good = 0; }
+	while (t < end) {
+		char test = *t++;
+		if (test == '\\' && *t && strchr("\\]",*t)) {
+			test = *t++;
+		} else if (test == '\\' && *t == 't') {
+			test = '\t'; t++;
+		}
+
+		if (*t == '-') {
+			t++;
+			if (t == end) return 0;
+			char right = *t++;
+			if (right == '\\' && *t && strchr("\\]",*t)) {
+				right = *t++;
+			} else if (right == '\\' && *t == 't') {
+				right = '\t'; t++;
+			}
+			if (mode ? (fold_case(c) >= fold_case(test) && fold_case(c) <= fold_case(right)) : (c >= test && c <= right)) return good;
+		} else {
+			if (mode ? (fold_case(c) == fold_case(test)) : (c == test)) return good;
+		}
+	}
+	return !good;
+}
+
+/** Matcher for '.': accepts any character. */
+int match_dot(struct MatchQualifier * self, char c, int mode) {
+	(void)self; (void)c; (void)mode;
+	return 1;
+}
+
+/** Offset and length of a matched group; copy is not used in this file. */
+struct BackRef {
+	int start;
+	int len;
+	uint32_t * copy;
+};
+
+/** One line of input; actual is its length without any trailing newline. */
+struct Line {
+	int actual;
+	char * text;
+};
+
+#define MAX_REFS 10
+/**
+ * Try to match needle against line, starting exactly at offset j.
+ *
+ * @param line       Line being searched.
+ * @param j          Offset at which the match has to begin.
+ * @param needle     NUL-terminated pattern text.
+ * @param ignorecase Mode handed to the matchers: 0 is exact, 1 folds case.
+ * @param len        If not NULL, receives the length of the match.
+ * @param needleout  NULL for a top-level match. When set, the pattern is the
+ *                   inside of a group: the match succeeds at the closing ')'
+ *                   and *needleout is pointed just past it, while running
+ *                   out of pattern first is a failure.
+ * @param refindex   Slot in refs for the next group; 0 records nothing.
+ * @param refs       Group records, or NULL.
+ * @return 1 if the pattern matches at j, 0 otherwise.
+ */
+int regex_matches(struct Line * line, int j, char * needle, int ignorecase, int *len, char **needleout, int refindex, struct BackRef * refs) {
+	int k = j;
+	char * match = needle;
+	if (*match == '^') {
+		if (j != 0) return 0;
+		match++;
+	}
+	while (k < line->actual + 1) {
+		if (needleout && *match == ')') {
+			*needleout = match + 1;
+			if (len) *len = k - j;
+			return 1;
+		}
+		if (*match == '\0') {
+			if (needleout) return 0;
+			if (len) *len = k - j;
+			return 1;
+		}
+		if (*match == '$') {
+			if (k != line->actual) return 0;
+			match++;
+			continue;
+		}
+
+		/* Nothing can be consumed at the end of the line, but an atom
+		 * followed by '?' or '*' and a group may still match nothing. */
+		int at_end = (k == line->actual);
+
+		struct MatchQualifier matcher = {match_char, .matchChar=*match};
+		if (*match == '.') {
+			matcher.matchFunc = match_dot;
+			match++;
+		} else if (*match == '\\' && match[1] && strchr("$^/\\.[?]*+()",match[1]) != NULL) {
+			/* match[1] is tested first: strchr() also finds the terminator. */
+			matcher.matchChar = match[1];
+			match += 2;
+		} else if (*match == '\\' && match[1] == 't') {
+			matcher.matchChar = '\t';
+			match += 2;
+		} else if (*match == '[') {
+			char * s = match+1;
+			char * e = s;
+			while (*e && *e != ']') {
+				if (*e == '\\' && e[1]) e++; /* an escaped byte never closes the set */
+				e++;
+			}
+			if (!*e) break; /* fail match on unterminated [] sequence */
+			match = e + 1;
+			matcher.matchFunc = match_squares;
+			matcher.matchSquares.start = s;
+			matcher.matchSquares.end = e;
+		} else if (*match == '(') {
+			match++;
+			int _len;
+			char * newmatch;
+			if (!regex_matches(line, k, match, ignorecase, &_len, &newmatch, 0, NULL)) break;
+			match = newmatch;
+			if (refs && refindex && refindex < MAX_REFS) {
+				refs[refindex].start = k;
+				refs[refindex].len = _len;
+				refindex++;
+			}
+			k += _len;
+			continue;
+		} else {
+			match++;
+		}
+		if (*match == '?') {
+			/* Optional */
+			match++;
+			if (!at_end && matcher.matchFunc(&matcher, line->text[k], ignorecase)) {
+				int _len;
+				if (regex_matches(line,k+1,match,ignorecase,&_len, needleout, refindex, refs)) {
+					if (len) *len = _len + k + 1 - j;
+					return 1;
+				}
+			}
+			continue;
+		} else if (*match == '+' || *match == '*') {
+			/* Must match at least one */
+			if (*match == '+') {
+				if (at_end || !matcher.matchFunc(&matcher, line->text[k], ignorecase)) break;
+				k++;
+			}
+			/* Match any */
+			match++;
+			int greedy = 1;
+			if (*match == '?') {
+				/* non-greedy */
+				match++;
+				greedy = 0;
+			}
+
+			int _j = k;
+			while (_j < line->actual + 1) {
+				int _len;
+				if (!greedy && regex_matches(line, _j, match, ignorecase, &_len, needleout, refindex, refs)) {
+					if (len) *len = _len + _j - j;
+					return 1;
+				}
+				if (_j < line->actual && !matcher.matchFunc(&matcher, line->text[_j], ignorecase)) break;
+				_j++;
+			}
+			if (!greedy) return 0;
+			while (_j >= k) {
+				int _len;
+				if (regex_matches(line, _j, match, ignorecase, &_len, needleout, refindex, refs)) {
+					if (len) *len = _len + _j - j;
+					return 1;
+				}
+				_j--;
+			}
+			return 0;
+		} else {
+			if (at_end || !matcher.matchFunc(&matcher, line->text[k], ignorecase)) break;
+			k++;
+		}
+	}
+	return 0;
+}
+
+/** Top-level match at offset j using the global ignorecase setting. */
+int subsearch_matches(struct Line * line, int j, char * needle, int *len) {
+	return regex_matches(line, j, needle, ignorecase, len, NULL, 0, NULL);
+}
+
+/** Print the help text to stderr; always returns 1. */
+int usage(char ** argv) {
+#define _I "\033[3m"
+#define _E "\033[0m\n"
+	fprintf(stderr, "usage: %s [-ivqoc] PATTERN [FILE...]\n"
+		"\n"
+		" Supported options:\n"
+		" -c " _I "Instead of printing matches, print counts of matched lines." _E
+		" -i " _I "Ignore case in input and pattern." _E
+		" -o " _I "Print only the matching parts of each line, separating\n"
+		" each match with a line feed." _E
+		" -q " _I "Exit immediately with 0 when a match (or, with -v,\n"
+		" non-match) is found, do not print matches." _E
+		" -v " _I "Invert match - print lines that do not match pattern." _E
+		"\n"
+		" Supported regex syntax:\n"
+		" [abc] " _I "Match one of a set of characters." _E
+		" [a-z] " _I "Match one from a range of characters." _E
+		" (abc) " _I "Match a group; does nothing here, supported for compatibility\n"
+		" with bim and a possible future sed implementation." _E
+		" . " _I "Match any single character." _E
+		" ^ " _I "Match the start of the line." _E
+		" $ " _I "Match the end of the line." _E
+		"\n"
+		" Modifiers (can be combined with [], ., and single characters):\n"
+		" ? " _I "Match optionally" _E
+		" * " _I "Match any number of occurances" _E
+		" + " _I "Match at least one occurance" _E
+		" *? +? " _I "Non-greedy match variants of * and +" _E
+		"\n"
+		" Some characters can be escaped in the pattern with \\.\n"
+		" The regex engine is not Unicode-aware.\n",
+		argv[0]);
+#undef _I
+#undef _E
+	return 1;
+}
+
+/**
+ * Find the leftmost match of needle in line at or after offset from.
+ * Every offset up to and including the end of the line is tried, so a
+ * pattern that can match nothing (or only "$") is found on empty lines.
+ * Returns the offset of the match with *len set, or -1 without one.
+ */
+static int find_match(struct Line * line, int from, char * needle, int * len) {
+	for (int j = from; j <= line->actual; ++j) {
+		if (subsearch_matches(line, j, needle, len)) return j;
+	}
+	return -1;
+}
+
+/**
+ * Print a selected line whose first match (pos, len) is already known.
+ * With -o every non-empty match is printed on a line of its own; otherwise
+ * the line is printed once, with matches coloured when is_tty is set.
+ * prefix is the file name to label output with, or NULL for no label.
+ */
+static void print_matches(struct Line * line, char * needle, int pos, int len, const char * prefix, int is_tty) {
+	int cursor = 0; /* first byte of the line not printed yet */
+	if (!only_matching && prefix) fprintf(stdout, "%s:", prefix);
+	while (pos >= 0) {
+		if (only_matching) {
+			if (len > 0) {
+				if (prefix) fprintf(stdout, "%s:", prefix);
+				fprintf(stdout, "%.*s\n", len, line->text + pos);
+			}
+		} else {
+			fprintf(stdout, "%.*s", pos - cursor, line->text + cursor);
+			if (len > 0) {
+				fprintf(stdout, "%s%.*s%s",
+					is_tty ? "\033[1;31m" : "",
+					len,
+					line->text + pos,
+					is_tty ? "\033[0m" : "");
+			}
+			cursor = pos + len;
+		}
+		/* An empty match still has to move the search forward, or the
+		 * same offset would be found again forever. */
+		pos = find_match(line, pos + (len > 0 ? len : 1), needle, &len);
+	}
+	if (!only_matching) fprintf(stdout, "%s", line->text + cursor);
+}
+
+/**
+ * Parse options, then scan each named file (or standard input when none
+ * is named or the name is "-") line by line.
+ * Returns 0 if any line was selected and 1 otherwise, including when the
+ * usage text is shown or a file cannot be opened.
+ */
+int main(int argc, char ** argv) {
+	int opt;
+	while ((opt = getopt(argc, argv, "?hivqoc")) != -1) {
+		switch (opt) {
+			case 'h':
+			case '?':
+				return usage(argv);
+			case 'i':
+				ignorecase = 1;
+				break;
+			case 'v':
+				invert = 1;
+				break;
+			case 'q':
+				quiet = 1;
+				break;
+			case 'o':
+				only_matching = 1;
+				break;
+			case 'c':
+				counts = 1;
+				break;
+		}
+	}
+
+	if (optind == argc) return usage(argv);
+
+	char * needle = argv[optind];
+	char buf[LINE_SIZE];
+	int ret = 1;
+	int is_tty = isatty(STDOUT_FILENO);
+
+	optind++;
+
+	/* Label output with file names only when more than one input was given. */
+	int showFilenames = (optind + 1 < argc);
+
+	do {
+		FILE * input = stdin;
+		int count = 0;
+		int stop = 0;
+		if (optind < argc && strcmp(argv[optind],"-")) {
+			input = fopen(argv[optind], "r");
+			if (!input) {
+				fprintf(stderr, "%s: %s: %s\n", argv[0], argv[optind], strerror(errno));
+				return 1;
+			}
+		}
+
+		const char * filename = input == stdin ? "(standard input)" : argv[optind];
+		const char * prefix = showFilenames ? filename : NULL;
+
+		while (fgets(buf, LINE_SIZE, input)) {
+			int lineLength = (int)strlen(buf);
+			if (lineLength && buf[lineLength-1] == '\n') {
+				lineLength--;
+			}
+			struct Line line = {
+				lineLength,
+				buf
+			};
+
+			int len = 0;
+			int pos = find_match(&line, 0, needle, &len);
+			int selected = invert ? (pos < 0) : (pos >= 0);
+			if (!selected) continue;
+
+			ret = 0;
+			if (counts) {
+				count++;
+				continue;
+			}
+			if (quiet) {
+				stop = 1;
+				break;
+			}
+			if (invert) {
+				if (prefix) fprintf(stdout, "%s:", prefix);
+				fprintf(stdout, "%s", buf);
+			} else {
+				print_matches(&line, needle, pos, len, prefix, is_tty);
+			}
+		}
+
+		if (input != stdin) fclose(input);
+
+		/* -q: the first selected line decides the exit status, so stop here. */
+		if (stop) return 0;
+
+		if (counts) {
+			if (prefix) fprintf(stdout, "%s:", prefix);
+			fprintf(stdout, "%d\n", count);
+		}
+
+		optind++;
+	} while (optind < argc);
+
+	return ret;
+}
+
diff --git a/apps/sh.c b/apps/sh.c
index f0d81c2b..a732c0e8 100644
--- a/apps/sh.c
+++ b/apps/sh.c
@@ -779,7 +779,6 @@ static struct alternative cmd_alternatives[] = {
 	{"ip", ALT_NETIF},
 
 	/* Some random other stuff */
-	{"grep", "fgrep", "non-regex-capable grep"},
 	{"more", "bim -", "paging to a text editor"},
 	{"less", "bim -", "paging to a text editor"},
 