grep: Add a basic regex-matching grep with some options
This commit is contained in:
parent
74b6dcc8e3
commit
811c033a13
385
apps/grep.c
Normal file
385
apps/grep.c
Normal file
@ -0,0 +1,385 @@
|
||||
/**
|
||||
* @brief grep - mediocre grep
|
||||
*
|
||||
* Based on the regex search matcher in bim.
|
||||
*
|
||||
* @copyright
|
||||
* This file is part of ToaruOS and is released under the terms
|
||||
* of the NCSA / University of Illinois License - see LICENSE.md
|
||||
* Copyright (C) 2022 K. Lange
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <getopt.h>
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
|
||||
#define LINE_SIZE 4096
|
||||
|
||||
static int invert = 0;
|
||||
static int ignorecase = 0;
|
||||
static int quiet = 0;
|
||||
static int only_matching = 0;
|
||||
static int counts = 0;
|
||||
|
||||
struct MatchQualifier {
|
||||
int (*matchFunc)(struct MatchQualifier*,char,int);
|
||||
union {
|
||||
char matchChar;
|
||||
struct {
|
||||
char * start;
|
||||
char * end;
|
||||
} matchSquares;
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Helper for handling smart case sensitivity.
|
||||
*/
|
||||
int match_char(struct MatchQualifier * self, char b, int mode) {
|
||||
if (mode == 0) {
|
||||
return self->matchChar == b;
|
||||
} else if (mode == 1) {
|
||||
return tolower(self->matchChar) == tolower(b);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int match_squares(struct MatchQualifier * self, char c, int mode) {
|
||||
char * start = self->matchSquares.start;
|
||||
char * end = self->matchSquares.end;
|
||||
char * t = start;
|
||||
int good = 1;
|
||||
if (*t == '^') { t++; good = 0; }
|
||||
while (t != end) {
|
||||
char test = *t++;
|
||||
if (test == '\\' && *t && strchr("\\]",*t)) {
|
||||
test = *t++;
|
||||
} else if (test == '\\' && *t == 't') {
|
||||
test = '\t'; t++;
|
||||
}
|
||||
|
||||
if (*t == '-') {
|
||||
t++;
|
||||
if (t == end) return 0;
|
||||
char right = *t++;
|
||||
if (right == '\\' && *t && strchr("\\]",*t)) {
|
||||
right = *t++;
|
||||
} else if (right == '\\' && *t == 't') {
|
||||
right = '\t'; t++;
|
||||
}
|
||||
if (mode ? (tolower(c) >= tolower(test) && tolower(c) <= tolower(right)) : (c >= test && c <= right)) return good;
|
||||
} else {
|
||||
if (mode ? (tolower(c) == tolower(test)) : (c == test)) return good;
|
||||
}
|
||||
}
|
||||
return !good;
|
||||
}
|
||||
|
||||
int match_dot(struct MatchQualifier * self, char c, int mode) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
struct BackRef {
|
||||
int start;
|
||||
int len;
|
||||
uint32_t * copy;
|
||||
};
|
||||
|
||||
struct Line {
|
||||
int actual;
|
||||
char * text;
|
||||
};
|
||||
|
||||
#define MAX_REFS 10
|
||||
int regex_matches(struct Line * line, int j, char * needle, int ignorecase, int *len, char **needleout, int refindex, struct BackRef * refs) {
|
||||
int k = j;
|
||||
char * match = needle;
|
||||
if (*match == '^') {
|
||||
if (j != 0) return 0;
|
||||
match++;
|
||||
}
|
||||
while (k < line->actual + 1) {
|
||||
if (needleout && *match == ')') {
|
||||
*needleout = match + 1;
|
||||
if (len) *len = k - j;
|
||||
return 1;
|
||||
}
|
||||
if (*match == '\0') {
|
||||
if (needleout) return 0;
|
||||
if (len) *len = k - j;
|
||||
return 1;
|
||||
}
|
||||
if (*match == '$') {
|
||||
if (k != line->actual) return 0;
|
||||
match++;
|
||||
continue;
|
||||
}
|
||||
if (k == line->actual) break;
|
||||
|
||||
struct MatchQualifier matcher = {match_char, .matchChar=*match};
|
||||
if (*match == '.') {
|
||||
matcher.matchFunc = match_dot;
|
||||
match++;
|
||||
} else if (*match == '\\' && strchr("$^/\\.[?]*+()",match[1]) != NULL) {
|
||||
matcher.matchChar = match[1];
|
||||
match += 2;
|
||||
} else if (*match == '\\' && match[1] == 't') {
|
||||
matcher.matchChar = '\t';
|
||||
match += 2;
|
||||
} else if (*match == '[') {
|
||||
char * s = match+1;
|
||||
char * e = s;
|
||||
while (*e && *e != ']') {
|
||||
if (*e == '\\' && e[1] == ']') e++;
|
||||
e++;
|
||||
}
|
||||
if (!*e) break; /* fail match on unterminated [] sequence */
|
||||
match = e + 1;
|
||||
matcher.matchFunc = match_squares;
|
||||
matcher.matchSquares.start = s;
|
||||
matcher.matchSquares.end = e;
|
||||
} else if (*match == '(') {
|
||||
match++;
|
||||
int _len;
|
||||
char * newmatch;
|
||||
if (!regex_matches(line, k, match, ignorecase, &_len, &newmatch, 0, NULL)) break;
|
||||
match = newmatch;
|
||||
if (refindex && refindex < MAX_REFS) {
|
||||
refs[refindex].start = k;
|
||||
refs[refindex].len = _len;
|
||||
refindex++;
|
||||
}
|
||||
k += _len;
|
||||
continue;
|
||||
} else {
|
||||
match++;
|
||||
}
|
||||
if (*match == '?') {
|
||||
/* Optional */
|
||||
match++;
|
||||
if (matcher.matchFunc(&matcher, line->text[k], ignorecase)) {
|
||||
int _len;
|
||||
if (regex_matches(line,k+1,match,ignorecase,&_len, needleout, refindex, refs)) {
|
||||
if (len) *len = _len + k + 1 - j;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
} else if (*match == '+' || *match == '*') {
|
||||
/* Must match at least one */
|
||||
if (*match == '+') {
|
||||
if (!matcher.matchFunc(&matcher, line->text[k], ignorecase)) break;
|
||||
k++;
|
||||
}
|
||||
/* Match any */
|
||||
match++;
|
||||
int greedy = 1;
|
||||
if (*match == '?') {
|
||||
/* non-greedy */
|
||||
match++;
|
||||
greedy = 0;
|
||||
}
|
||||
|
||||
int _j = k;
|
||||
while (_j < line->actual + 1) {
|
||||
int _len;
|
||||
if (!greedy && regex_matches(line, _j, match, ignorecase, &_len, needleout, refindex, refs)) {
|
||||
if (len) *len = _len + _j - j;
|
||||
return 1;
|
||||
}
|
||||
if (_j < line->actual && !matcher.matchFunc(&matcher, line->text[_j], ignorecase)) break;
|
||||
_j++;
|
||||
}
|
||||
if (!greedy) return 0;
|
||||
while (_j >= k) {
|
||||
int _len;
|
||||
if (regex_matches(line, _j, match, ignorecase, &_len, needleout, refindex, refs)) {
|
||||
if (len) *len = _len + _j - j;
|
||||
return 1;
|
||||
}
|
||||
_j--;
|
||||
}
|
||||
return 0;
|
||||
} else {
|
||||
if (!matcher.matchFunc(&matcher, line->text[k], ignorecase)) break;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int subsearch_matches(struct Line * line, int j, char * needle, int *len) {
|
||||
return regex_matches(line, j, needle, ignorecase, len, NULL, 0, NULL);
|
||||
}
|
||||
|
||||
int usage(char ** argv) {
|
||||
#define _I "\033[3m"
|
||||
#define _E "\033[0m\n"
|
||||
fprintf(stderr, "usage: %s [-ivqoc] PATTERN [FILE...]\n"
|
||||
"\n"
|
||||
" Supported options:\n"
|
||||
" -c " _I "Instead of printing matches, print counts of matched lines." _E
|
||||
" -i " _I "Ignore case in input and pattern." _E
|
||||
" -o " _I "Print only the matching parts of each line, separating\n"
|
||||
" each match with a line feed." _E
|
||||
" -q " _I "Exit immediately with 0 when a match (or, with -v,\n"
|
||||
" non-match) is found, do not print matches." _E
|
||||
" -v " _I "Invert match - print lines that do not match pattern." _E
|
||||
"\n"
|
||||
" Supported regex syntax:\n"
|
||||
" [abc] " _I "Match one of a set of characters." _E
|
||||
" [a-z] " _I "Match one from a range of characters." _E
|
||||
" (abc) " _I "Match a group; does nothing here, supported for compatibility\n"
|
||||
" with bim and a possible future sed implementation." _E
|
||||
" . " _I "Match any single character." _E
|
||||
" ^ " _I "Match the start of the line." _E
|
||||
" $ " _I "Match the end of the line." _E
|
||||
"\n"
|
||||
" Modifiers (can be combined with [], ., and single characters):\n"
|
||||
" ? " _I "Match optionally" _E
|
||||
" * " _I "Match any number of occurances" _E
|
||||
" + " _I "Match at least one occurance" _E
|
||||
" *? +? " _I "Non-greedy match variants of * and +" _E
|
||||
"\n"
|
||||
" Some characters can be escaped in the pattern with \\.\n"
|
||||
" The regex engine is not Unicode-aware.\n",
|
||||
argv[0]);
|
||||
#undef _I
|
||||
#undef _E
|
||||
return 1;
|
||||
}
|
||||
|
||||
#define LINE_SIZE 4096
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
int opt;
|
||||
while ((opt = getopt(argc, argv, "?hivqoc")) != -1) {
|
||||
switch (opt) {
|
||||
case 'h':
|
||||
case '?':
|
||||
return usage(argv);
|
||||
case 'i':
|
||||
ignorecase = 1;
|
||||
break;
|
||||
case 'v':
|
||||
invert = 1;
|
||||
break;
|
||||
case 'q':
|
||||
quiet = 1;
|
||||
break;
|
||||
case 'o':
|
||||
only_matching = 1;
|
||||
break;
|
||||
case 'c':
|
||||
counts = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (optind == argc) return usage(argv);
|
||||
|
||||
char * needle = argv[optind];
|
||||
char buf[LINE_SIZE];
|
||||
int ret = 1;
|
||||
int is_tty = isatty(STDOUT_FILENO);
|
||||
|
||||
optind++;
|
||||
|
||||
|
||||
int showFilenames = (optind + 1 != argc);
|
||||
|
||||
do {
|
||||
FILE * input = stdin;
|
||||
int count = 0;
|
||||
if (optind < argc && strcmp(argv[optind],"-")) {
|
||||
input = fopen(argv[optind], "r");
|
||||
if (!input) {
|
||||
fprintf(stderr, "%s: %s: %s\n", argv[0], argv[optind], strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const char * filename = input == stdin ? "(standard input)" : argv[optind];
|
||||
|
||||
while (fgets(buf, LINE_SIZE, input)) {
|
||||
int lineLength = strlen(buf);
|
||||
if (lineLength && buf[lineLength-1] == '\n') {
|
||||
lineLength--;
|
||||
}
|
||||
struct Line line = {
|
||||
lineLength,
|
||||
buf
|
||||
};
|
||||
|
||||
if (!invert) {
|
||||
int lastMatch = 0;
|
||||
for (int j = 0; j < lineLength;) {
|
||||
int len;
|
||||
if (subsearch_matches(&line, j, needle, &len)) {
|
||||
ret = 0;
|
||||
if (counts) {
|
||||
count++;
|
||||
break;
|
||||
}
|
||||
if (quiet) goto _done;
|
||||
if (only_matching) {
|
||||
if (showFilenames) fprintf(stdout, "%s:", filename);
|
||||
fprintf(stdout, "%.*s\n", len, buf + j);
|
||||
} else {
|
||||
if (lastMatch == 0 && showFilenames) fprintf(stdout, "%s:", filename);
|
||||
fprintf(stdout, "%.*s%s%.*s%s",
|
||||
j - lastMatch,
|
||||
buf + lastMatch,
|
||||
is_tty ? "\033[1;31m" : "",
|
||||
len,
|
||||
buf + j,
|
||||
is_tty ? "\033[0m" : "");
|
||||
}
|
||||
lastMatch = j + len;
|
||||
j = lastMatch;
|
||||
} else {
|
||||
j++;
|
||||
}
|
||||
}
|
||||
if (counts) continue;
|
||||
if (!only_matching && lastMatch) {
|
||||
fprintf(stdout, "%s", buf + lastMatch);
|
||||
}
|
||||
} else {
|
||||
int matched = 0;
|
||||
for (int j = 0; j < lineLength; ++j) {
|
||||
if (subsearch_matches(&line, j, needle, NULL)) {
|
||||
matched = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (matched) continue;
|
||||
ret = 0;
|
||||
if (counts) {
|
||||
count++;
|
||||
continue;
|
||||
}
|
||||
if (quiet) goto _done;
|
||||
if (showFilenames) fprintf(stdout, "%s:", filename);
|
||||
fprintf(stdout, "%s", buf);
|
||||
}
|
||||
}
|
||||
|
||||
_done: (void)0;
|
||||
if (input != stdin) fclose(input);
|
||||
|
||||
if (counts) {
|
||||
if (showFilenames) fprintf(stdout, "%s:", filename);
|
||||
fprintf(stdout, "%d\n", count);
|
||||
}
|
||||
|
||||
optind++;
|
||||
} while (optind < argc);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user