mirror of
https://github.com/lexborisov/Modest
synced 2024-11-25 07:09:35 +03:00
Comment all code in commoncrawl
This commit is contained in:
parent
2960726738
commit
e37c7ce017
@ -21,272 +21,272 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <dirent.h>
|
//#include <dirent.h>
|
||||||
#include <string.h>
|
//#include <string.h>
|
||||||
#include <unistd.h>
|
//#include <unistd.h>
|
||||||
|
//
|
||||||
#include <myhtml/api.h>
|
//#include <myhtml/api.h>
|
||||||
|
//
|
||||||
#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
|
//#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
|
||||||
|
//
|
||||||
static myhtml_tree_t* global_tree;
|
//static myhtml_tree_t* global_tree;
|
||||||
|
//
|
||||||
#define total_count_size 20
|
//#define total_count_size 20
|
||||||
static size_t total_count[total_count_size];
|
//static size_t total_count[total_count_size];
|
||||||
|
//
|
||||||
typedef void (*process_state_f)(const char* data, size_t filename_size);
|
//typedef void (*process_state_f)(const char* data, size_t filename_size);
|
||||||
typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
|
//typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
|
||||||
|
//
|
||||||
void print_total_count(void)
|
//void print_total_count(void)
|
||||||
{
|
//{
|
||||||
size_t total = 0;
|
// size_t total = 0;
|
||||||
for(size_t i = 0; i < 7; i++)
|
// for(size_t i = 0; i < 7; i++)
|
||||||
total += total_count[i];
|
// total += total_count[i];
|
||||||
|
//
|
||||||
printf("Total: %zu\n" ,total);
|
// printf("Total: %zu\n" ,total);
|
||||||
|
//
|
||||||
printf("\t0-100: %zu\n", total_count[0]);
|
// printf("\t0-100: %zu\n", total_count[0]);
|
||||||
printf("\t100-1000: %zu\n", total_count[1]);
|
// printf("\t100-1000: %zu\n", total_count[1]);
|
||||||
printf("\t1000-5000: %zu\n", total_count[2]);
|
// printf("\t1000-5000: %zu\n", total_count[2]);
|
||||||
printf("\t5000-10000: %zu\n", total_count[3]);
|
// printf("\t5000-10000: %zu\n", total_count[3]);
|
||||||
printf("\t10000-50000: %zu\n", total_count[4]);
|
// printf("\t10000-50000: %zu\n", total_count[4]);
|
||||||
printf("\t50000-100000: %zu\n", total_count[5]);
|
// printf("\t50000-100000: %zu\n", total_count[5]);
|
||||||
printf("\t100000 and up: %zu\n", total_count[6]);
|
// printf("\t100000 and up: %zu\n", total_count[6]);
|
||||||
}
|
//}
|
||||||
|
//
|
||||||
void listdir(const char *name, process_state_f callback)
|
//void listdir(const char *name, process_state_f callback)
|
||||||
{
|
//{
|
||||||
memset(total_count, 0, sizeof(size_t) * total_count_size);
|
// memset(total_count, 0, sizeof(size_t) * total_count_size);
|
||||||
|
//
|
||||||
DIR *dir;
|
// DIR *dir;
|
||||||
struct dirent *entry;
|
// struct dirent *entry;
|
||||||
|
//
|
||||||
if(!(dir = opendir(name)))
|
// if(!(dir = opendir(name)))
|
||||||
return;
|
// return;
|
||||||
if(!(entry = readdir(dir)))
|
// if(!(entry = readdir(dir)))
|
||||||
return;
|
// return;
|
||||||
|
//
|
||||||
do {
|
// do {
|
||||||
if(entry->d_type == DT_DIR) {
|
// if(entry->d_type == DT_DIR) {
|
||||||
char path[2048];
|
// char path[2048];
|
||||||
|
//
|
||||||
int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
|
// int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
|
||||||
path[len] = '\0';
|
// path[len] = '\0';
|
||||||
|
//
|
||||||
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
|
// if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
|
||||||
continue;
|
// continue;
|
||||||
|
//
|
||||||
listdir(path, callback);
|
// listdir(path, callback);
|
||||||
}
|
// }
|
||||||
else {
|
// else {
|
||||||
char path[2048];
|
// char path[2048];
|
||||||
|
//
|
||||||
int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
|
// int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
|
||||||
path[len] = '\0';
|
// path[len] = '\0';
|
||||||
|
//
|
||||||
if(path[ (len - 3) ] == '.' && path[ (len - 2) ] == 'g' && path[ (len - 1) ] == 'z') {
|
// if(path[ (len - 3) ] == '.' && path[ (len - 2) ] == 'g' && path[ (len - 1) ] == 'z') {
|
||||||
callback(path, len);
|
// callback(path, len);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
while ((entry = readdir(dir)));
|
// while ((entry = readdir(dir)));
|
||||||
|
//
|
||||||
closedir(dir);
|
// closedir(dir);
|
||||||
}
|
//}
|
||||||
|
//
|
||||||
void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
|
//void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
|
||||||
{
|
//{
|
||||||
memset(total_count, 0, sizeof(size_t) * total_count_size);
|
// memset(total_count, 0, sizeof(size_t) * total_count_size);
|
||||||
|
//
|
||||||
FILE *fh = fopen(filename, "rb");
|
// FILE *fh = fopen(filename, "rb");
|
||||||
if(fh == NULL) {
|
// if(fh == NULL) {
|
||||||
fprintf(stderr, "Can't open html file: %s\n", filename);
|
// fprintf(stderr, "Can't open html file: %s\n", filename);
|
||||||
exit(EXIT_FAILURE);
|
// exit(EXIT_FAILURE);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
fseek(fh, 0L, SEEK_END);
|
// fseek(fh, 0L, SEEK_END);
|
||||||
long size = ftell(fh);
|
// long size = ftell(fh);
|
||||||
fseek(fh, 0L, SEEK_SET);
|
// fseek(fh, 0L, SEEK_SET);
|
||||||
|
//
|
||||||
char *data = (char*)malloc(size + 1);
|
// char *data = (char*)malloc(size + 1);
|
||||||
if(data == NULL) {
|
// if(data == NULL) {
|
||||||
fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
|
// fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
|
||||||
exit(EXIT_FAILURE);
|
// exit(EXIT_FAILURE);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
size_t nread = fread(data, 1, size, fh);
|
// size_t nread = fread(data, 1, size, fh);
|
||||||
if (nread != size) {
|
// if (nread != size) {
|
||||||
fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", size, nread);
|
// fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", size, nread);
|
||||||
exit(EXIT_FAILURE);
|
// exit(EXIT_FAILURE);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
fclose(fh);
|
// fclose(fh);
|
||||||
|
//
|
||||||
if(size < 0)
|
// if(size < 0)
|
||||||
size = 0;
|
// size = 0;
|
||||||
|
//
|
||||||
size_t from = 0;
|
// size_t from = 0;
|
||||||
char path[2048];
|
// char path[2048];
|
||||||
|
//
|
||||||
for(size_t i = 0; i < size; i++) {
|
// for(size_t i = 0; i < size; i++) {
|
||||||
if(data[i] == '\n') {
|
// if(data[i] == '\n') {
|
||||||
int len = snprintf(path, sizeof(path)-1, "%s/%.*s", db_dir, (int)(i - from), &data[from]);
|
// int len = snprintf(path, sizeof(path)-1, "%s/%.*s", db_dir, (int)(i - from), &data[from]);
|
||||||
path[len] = '\0';
|
// path[len] = '\0';
|
||||||
|
//
|
||||||
callback(path, len);
|
// callback(path, len);
|
||||||
|
//
|
||||||
from = i + 1;
|
// from = i + 1;
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
free(data);
|
// free(data);
|
||||||
}
|
//}
|
||||||
|
//
|
||||||
void process(const char* filename, size_t filename_size, parser_state_f parser)
|
//void process(const char* filename, size_t filename_size, parser_state_f parser)
|
||||||
{
|
//{
|
||||||
FILE *fh = fopen(filename, "rb");
|
// FILE *fh = fopen(filename, "rb");
|
||||||
if(fh == NULL) {
|
// if(fh == NULL) {
|
||||||
fprintf(stderr, "Can't open html file: %s\n", filename);
|
// fprintf(stderr, "Can't open html file: %s\n", filename);
|
||||||
exit(EXIT_FAILURE);
|
// exit(EXIT_FAILURE);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
fseek(fh, 0L, SEEK_SET);
|
// fseek(fh, 0L, SEEK_SET);
|
||||||
|
//
|
||||||
const char *ct = "Content-Length:";
|
// const char *ct = "Content-Length:";
|
||||||
size_t ct_size = strlen(ct);
|
// size_t ct_size = strlen(ct);
|
||||||
|
//
|
||||||
char * line = NULL;
|
// char * line = NULL;
|
||||||
long get_size = 0;
|
// long get_size = 0;
|
||||||
ssize_t read = 0;
|
// ssize_t read = 0;
|
||||||
|
//
|
||||||
size_t count = 0, read_len = 0;
|
// size_t count = 0, read_len = 0;
|
||||||
|
//
|
||||||
while ((read = getline(&line, &read_len, fh)) != -1) {
|
// while ((read = getline(&line, &read_len, fh)) != -1) {
|
||||||
|
//
|
||||||
if(strncmp(ct, line, ct_size) == 0) {
|
// if(strncmp(ct, line, ct_size) == 0) {
|
||||||
size_t i;
|
// size_t i;
|
||||||
|
//
|
||||||
for(i = ct_size; i < read_len; i++)
|
// for(i = ct_size; i < read_len; i++)
|
||||||
if(line[i] != '\n' && line[i] != '\r' && line[i] != ' ')
|
// if(line[i] != '\n' && line[i] != '\r' && line[i] != ' ')
|
||||||
break;
|
// break;
|
||||||
|
//
|
||||||
get_size = strtol(&line[i], NULL, 0);
|
// get_size = strtol(&line[i], NULL, 0);
|
||||||
}
|
// }
|
||||||
else if(get_size && line[0] == '\r' && line[1] == '\n') {
|
// else if(get_size && line[0] == '\r' && line[1] == '\n') {
|
||||||
long head_begin = ftell(fh) + 2;
|
// long head_begin = ftell(fh) + 2;
|
||||||
long end = head_begin + get_size;
|
// long end = head_begin + get_size;
|
||||||
|
//
|
||||||
while ((read = getline(&line, &read_len, fh)) != -1) {
|
// while ((read = getline(&line, &read_len, fh)) != -1) {
|
||||||
//printf("%.*s", (int)read_len, line);
|
// //printf("%.*s", (int)read_len, line);
|
||||||
|
//
|
||||||
if(line[0] == '\r' && line[1] == '\n')
|
// if(line[0] == '\r' && line[1] == '\n')
|
||||||
break;
|
// break;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
long head_end = ftell(fh);
|
// long head_end = ftell(fh);
|
||||||
|
//
|
||||||
size_t html_length = (end - head_end);
|
// size_t html_length = (end - head_end);
|
||||||
char *html = malloc(html_length + 1);
|
// char *html = malloc(html_length + 1);
|
||||||
|
//
|
||||||
size_t nread = fread(html, 1, html_length, fh);
|
// size_t nread = fread(html, 1, html_length, fh);
|
||||||
if (nread != html_length) {
|
// if (nread != html_length) {
|
||||||
fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", html_length, nread);
|
// fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", html_length, nread);
|
||||||
exit(EXIT_FAILURE);
|
// exit(EXIT_FAILURE);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
count++;
|
// count++;
|
||||||
parser(html, html_length, count);
|
// parser(html, html_length, count);
|
||||||
|
//
|
||||||
get_size = 0;
|
// get_size = 0;
|
||||||
free(html);
|
// free(html);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
fclose(fh);
|
// fclose(fh);
|
||||||
}
|
//}
|
||||||
|
//
|
||||||
void html_parser(const char* html, size_t html_length, size_t count)
|
//void html_parser(const char* html, size_t html_length, size_t count)
|
||||||
{
|
//{
|
||||||
if((count % 1000) == 0) {
|
// if((count % 1000) == 0) {
|
||||||
printf("\t%zu\n", count);
|
// printf("\t%zu\n", count);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
myencoding_t encoding = 0;
|
// myencoding_t encoding = 0;
|
||||||
//myhtml_encoding_detect(html, html_length, &encoding);
|
// //myhtml_encoding_detect(html, html_length, &encoding);
|
||||||
|
//
|
||||||
// parse html
|
// // parse html
|
||||||
myhtml_status_t status = myhtml_parse(global_tree, encoding, html, html_length);
|
// myhtml_status_t status = myhtml_parse(global_tree, encoding, html, html_length);
|
||||||
if(status != MyHTML_STATUS_OK) {
|
// if(status != MyHTML_STATUS_OK) {
|
||||||
fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
|
// fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
|
||||||
exit(EXIT_FAILURE);
|
// exit(EXIT_FAILURE);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
if(html_length < 100)
|
// if(html_length < 100)
|
||||||
total_count[0]++;
|
// total_count[0]++;
|
||||||
else if(html_length >= 100 && html_length < 1000)
|
// else if(html_length >= 100 && html_length < 1000)
|
||||||
total_count[1]++;
|
// total_count[1]++;
|
||||||
else if(html_length >= 1000 && html_length < 5000)
|
// else if(html_length >= 1000 && html_length < 5000)
|
||||||
total_count[2]++;
|
// total_count[2]++;
|
||||||
else if(html_length >= 5000 && html_length < 10000)
|
// else if(html_length >= 5000 && html_length < 10000)
|
||||||
total_count[3]++;
|
// total_count[3]++;
|
||||||
else if(html_length >= 10000 && html_length < 50000)
|
// else if(html_length >= 10000 && html_length < 50000)
|
||||||
total_count[4]++;
|
// total_count[4]++;
|
||||||
else if(html_length >= 50000 && html_length < 100000)
|
// else if(html_length >= 50000 && html_length < 100000)
|
||||||
total_count[5]++;
|
// total_count[5]++;
|
||||||
else if(html_length >= 100000)
|
// else if(html_length >= 100000)
|
||||||
total_count[6]++;
|
// total_count[6]++;
|
||||||
|
//
|
||||||
//myhtml_tree_print_node_children(global_tree, global_tree->document, stdout, 0);
|
// //myhtml_tree_print_node_children(global_tree, global_tree->document, stdout, 0);
|
||||||
}
|
//}
|
||||||
|
//
|
||||||
void process_unpack(const char* filename, size_t filename_size)
|
//void process_unpack(const char* filename, size_t filename_size)
|
||||||
{
|
//{
|
||||||
char command[2048];
|
// char command[2048];
|
||||||
snprintf(command, sizeof(command)-1, "gzip -k -d %s", filename);
|
// snprintf(command, sizeof(command)-1, "gzip -k -d %s", filename);
|
||||||
|
//
|
||||||
printf("Unzip %s\n", filename);
|
// printf("Unzip %s\n", filename);
|
||||||
|
//
|
||||||
system(command);
|
// system(command);
|
||||||
|
//
|
||||||
char new_path[2048];
|
// char new_path[2048];
|
||||||
size_t new_path_size = (filename_size - 3);
|
// size_t new_path_size = (filename_size - 3);
|
||||||
|
//
|
||||||
snprintf(new_path, sizeof(new_path)-1, "%.*s", (int)new_path_size, filename);
|
// snprintf(new_path, sizeof(new_path)-1, "%.*s", (int)new_path_size, filename);
|
||||||
|
//
|
||||||
printf("Process %s:\n", new_path);
|
// printf("Process %s:\n", new_path);
|
||||||
process(new_path, new_path_size, html_parser);
|
// process(new_path, new_path_size, html_parser);
|
||||||
printf("\n");
|
// printf("\n");
|
||||||
|
//
|
||||||
unlink(new_path);
|
// unlink(new_path);
|
||||||
}
|
//}
|
||||||
|
//
|
||||||
static void usage(void)
|
//static void usage(void)
|
||||||
{
|
//{
|
||||||
fprintf(stderr, "commoncrawl <dir with *.warc.gz>\n");
|
// fprintf(stderr, "commoncrawl <dir with *.warc.gz>\n");
|
||||||
}
|
//}
|
||||||
|
//
|
||||||
int main(int argc, const char * argv[])
|
int main(int argc, const char * argv[])
|
||||||
{
|
{
|
||||||
if (argc != 2) {
|
// if (argc != 2) {
|
||||||
usage();
|
// usage();
|
||||||
return 0;
|
// return 0;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
// basic init
|
// // basic init
|
||||||
myhtml_t* myhtml = myhtml_create();
|
// myhtml_t* myhtml = myhtml_create();
|
||||||
myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
|
// myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
|
||||||
|
//
|
||||||
// first tree init
|
// // first tree init
|
||||||
global_tree = myhtml_tree_create();
|
// global_tree = myhtml_tree_create();
|
||||||
myhtml_tree_init(global_tree, myhtml);
|
// myhtml_tree_init(global_tree, myhtml);
|
||||||
|
//
|
||||||
listdir(argv[1], process_unpack);
|
// listdir(argv[1], process_unpack);
|
||||||
|
//
|
||||||
// release resources
|
// // release resources
|
||||||
myhtml_tree_destroy(global_tree);
|
// myhtml_tree_destroy(global_tree);
|
||||||
myhtml_destroy(myhtml);
|
// myhtml_destroy(myhtml);
|
||||||
|
//
|
||||||
print_total_count();
|
// print_total_count();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user