mirror of
https://github.com/lexborisov/Modest
synced 2024-11-21 13:21:54 +03:00
Comment all code in commoncrawl
This commit is contained in:
parent
2960726738
commit
e37c7ce017
@ -21,272 +21,272 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <dirent.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <myhtml/api.h>
|
||||
|
||||
// Print a printf-style message to stderr and terminate with EXIT_FAILURE.
// Not referenced by the code visible in this file.
#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
|
||||
|
||||
// Parse tree reused by every html_parser() call; created and destroyed in main().
static myhtml_tree_t* global_tree;
|
||||
|
||||
// Histogram of parsed document sizes. html_parser() increments slots 0..6 and
// print_total_count() reports the same seven slots; the remaining slots are
// not touched by the code visible here.
#define total_count_size 20
|
||||
static size_t total_count[total_count_size];
|
||||
|
||||
// Callback invoked by listdir()/read_loaded() with a NUL-terminated path and its length.
typedef void (*process_state_f)(const char* data, size_t filename_size);
|
||||
// Callback invoked by process() with one extracted payload, its length and
// the 1-based index of that payload within the current file.
typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
|
||||
|
||||
void print_total_count(void)
|
||||
{
|
||||
size_t total = 0;
|
||||
for(size_t i = 0; i < 7; i++)
|
||||
total += total_count[i];
|
||||
|
||||
printf("Total: %zu\n" ,total);
|
||||
|
||||
printf("\t0-100: %zu\n", total_count[0]);
|
||||
printf("\t100-1000: %zu\n", total_count[1]);
|
||||
printf("\t1000-5000: %zu\n", total_count[2]);
|
||||
printf("\t5000-10000: %zu\n", total_count[3]);
|
||||
printf("\t10000-50000: %zu\n", total_count[4]);
|
||||
printf("\t50000-100000: %zu\n", total_count[5]);
|
||||
printf("\t100000 and up: %zu\n", total_count[6]);
|
||||
}
|
||||
|
||||
/*
 * Recursive worker for listdir(): walks `name` and every sub-directory below
 * it and calls `callback(path, length)` for each non-directory entry whose
 * path ends in ".gz".
 *
 * Kept separate from listdir() so that the histogram reset happens once per
 * top-level call instead of once per directory.
 */
static void listdir_walk(const char *name, process_state_f callback)
{
    DIR *dir = opendir(name);
    if (dir == NULL) {
        return;
    }

    struct dirent *entry;

    while ((entry = readdir(dir)) != NULL) {
        // Never descend into (or report) the self/parent links.
        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
            continue;
        }

        char path[2048];
        int len = snprintf(path, sizeof(path) - 1, "%s/%s", name, entry->d_name);

        // snprintf returns the length it *wanted* to write; anything that did
        // not fit would be a truncated (i.e. wrong) path, so skip it.
        if (len < 0 || (size_t)len >= sizeof(path) - 1) {
            fprintf(stderr, "Path too long, skipped: %s/%s\n", name, entry->d_name);
            continue;
        }

        if (entry->d_type == DT_DIR) {
            listdir_walk(path, callback);
        }
        else if (len >= 3 && memcmp(&path[len - 3], ".gz", 3) == 0) {
            callback(path, (size_t)len);
        }
    }

    // Always release the directory stream, including for empty directories.
    closedir(dir);
}

/*
 * Reset the size histogram, then recursively scan directory `name` and invoke
 * `callback` with the path and path length of every "*.gz" file found.
 *
 * Directories that cannot be opened are silently skipped. Entries whose
 * d_type is not DT_DIR are treated as files.
 */
void listdir(const char *name, process_state_f callback)
{
    memset(total_count, 0, sizeof(total_count));

    listdir_walk(name, callback);
}
|
||||
|
||||
/*
 * Reset the size histogram, read the list file `filename` into memory and, for
 * every newline-terminated line in it, call `callback` with the path
 * "<db_dir>/<line>" and that path's length.
 *
 * A final line that is not terminated by '\n' is not reported (same as the
 * original behaviour). Any failure to open, size, allocate for or read the
 * list file terminates the program with EXIT_FAILURE. Lines that would produce
 * a path longer than the internal buffer are skipped with a warning.
 */
void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
{
    memset(total_count, 0, sizeof(size_t) * total_count_size);

    FILE *fh = fopen(filename, "rb");
    if (fh == NULL) {
        fprintf(stderr, "Can't open html file: %s\n", filename);
        exit(EXIT_FAILURE);
    }

    // Determine the file size; validate it *before* it is used for
    // allocation and reading.
    long size = -1;
    if (fseek(fh, 0L, SEEK_END) == 0) {
        size = ftell(fh);
    }
    if (size < 0 || fseek(fh, 0L, SEEK_SET) != 0) {
        fprintf(stderr, "Can't determine size of file: %s\n", filename);
        fclose(fh);
        exit(EXIT_FAILURE);
    }

    size_t length = (size_t)size;

    char *data = malloc(length + 1);
    if (data == NULL) {
        fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
        fclose(fh);
        exit(EXIT_FAILURE);
    }

    size_t nread = fread(data, 1, length, fh);
    fclose(fh);

    if (nread != length) {
        fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", size, nread);
        free(data);
        exit(EXIT_FAILURE);
    }
    data[length] = '\0';

    size_t from = 0;
    char path[2048];

    for (size_t i = 0; i < length; i++) {
        if (data[i] != '\n') {
            continue;
        }

        size_t name_len = i - from;
        int len = -1;

        // Guard the size_t -> int precision cast and the buffer bound.
        if (name_len < sizeof(path)) {
            len = snprintf(path, sizeof(path) - 1, "%s/%.*s",
                           db_dir, (int)name_len, &data[from]);
        }

        if (len < 0 || (size_t)len >= sizeof(path) - 1) {
            fprintf(stderr, "Path too long, skipped entry at offset %zu of %s\n",
                    from, filename);
        }
        else {
            callback(path, (size_t)len);
        }

        from = i + 1;
    }

    free(data);
}
|
||||
|
||||
/*
 * Scan the file `filename` line by line and hand every record payload to
 * `parser`.
 *
 * A record is recognised as: a line starting with "Content-Length:" (its
 * numeric value is remembered), later followed by an empty "\r\n" line. From
 * there the next block of lines up to another empty "\r\n" line is skipped,
 * and the remaining bytes up to (position after the first empty line + 2 +
 * Content-Length) are read into a buffer and passed to
 * `parser(buffer, length, n)`, where n counts payloads from 1.
 *
 * `filename_size` is unused; it is kept for interface compatibility.
 * Any I/O, allocation or consistency error terminates the program with
 * EXIT_FAILURE.
 */
void process(const char* filename, size_t filename_size, parser_state_f parser)
{
    (void)filename_size;

    FILE *fh = fopen(filename, "rb");
    if (fh == NULL) {
        fprintf(stderr, "Can't open html file: %s\n", filename);
        exit(EXIT_FAILURE);
    }

    static const char ct[] = "Content-Length:";
    const size_t ct_size = sizeof(ct) - 1;

    char *line = NULL;      // getline-owned buffer, released at the end
    size_t line_cap = 0;    // capacity of `line`, NOT the length of the text
    ssize_t line_len = 0;

    long get_size = 0;
    size_t count = 0;

    while ((line_len = getline(&line, &line_cap, fh)) != -1) {

        if (strncmp(ct, line, ct_size) == 0) {
            size_t i = ct_size;

            // Skip separators after the colon, bounded by the real line length.
            while (i < (size_t)line_len &&
                   (line[i] == ' ' || line[i] == '\r' || line[i] == '\n')) {
                i++;
            }

            // Lengths are decimal; base 0 would read "010" as octal.
            get_size = strtol(&line[i], NULL, 10);
        }
        else if (get_size && line[0] == '\r' && line[1] == '\n') {
            long block_begin = ftell(fh);
            if (block_begin < 0) {
                fprintf(stderr, "Can't get position in file: %s\n", filename);
                exit(EXIT_FAILURE);
            }

            long end = block_begin + 2 + get_size;

            // Skip the inner header block up to its terminating empty line.
            while ((line_len = getline(&line, &line_cap, fh)) != -1) {
                if (line[0] == '\r' && line[1] == '\n') {
                    break;
                }
            }

            long head_end = ftell(fh);

            // A declared length that ends before the current position would
            // wrap to a huge size_t below.
            if (head_end < 0 || end < head_end) {
                fprintf(stderr, "Malformed record at offset %ld in file: %s\n",
                        block_begin, filename);
                exit(EXIT_FAILURE);
            }

            size_t html_length = (size_t)(end - head_end);

            char *html = malloc(html_length + 1);
            if (html == NULL) {
                fprintf(stderr, "Can't allocate %zu bytes for record in file: %s\n",
                        html_length + 1, filename);
                exit(EXIT_FAILURE);
            }

            size_t nread = fread(html, 1, html_length, fh);
            if (nread != html_length) {
                fprintf(stderr, "could not read %zu bytes (%zu bytes done)\n", html_length, nread);
                exit(EXIT_FAILURE);
            }
            html[html_length] = '\0';

            count++;
            parser(html, html_length, count);

            get_size = 0;
            free(html);
        }
    }

    free(line);
    fclose(fh);
}
|
||||
|
||||
void html_parser(const char* html, size_t html_length, size_t count)
|
||||
{
|
||||
if((count % 1000) == 0) {
|
||||
printf("\t%zu\n", count);
|
||||
}
|
||||
|
||||
myencoding_t encoding = 0;
|
||||
//myhtml_encoding_detect(html, html_length, &encoding);
|
||||
|
||||
// parse html
|
||||
myhtml_status_t status = myhtml_parse(global_tree, encoding, html, html_length);
|
||||
if(status != MyHTML_STATUS_OK) {
|
||||
fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if(html_length < 100)
|
||||
total_count[0]++;
|
||||
else if(html_length >= 100 && html_length < 1000)
|
||||
total_count[1]++;
|
||||
else if(html_length >= 1000 && html_length < 5000)
|
||||
total_count[2]++;
|
||||
else if(html_length >= 5000 && html_length < 10000)
|
||||
total_count[3]++;
|
||||
else if(html_length >= 10000 && html_length < 50000)
|
||||
total_count[4]++;
|
||||
else if(html_length >= 50000 && html_length < 100000)
|
||||
total_count[5]++;
|
||||
else if(html_length >= 100000)
|
||||
total_count[6]++;
|
||||
|
||||
//myhtml_tree_print_node_children(global_tree, global_tree->document, stdout, 0);
|
||||
}
|
||||
|
||||
void process_unpack(const char* filename, size_t filename_size)
|
||||
{
|
||||
char command[2048];
|
||||
snprintf(command, sizeof(command)-1, "gzip -k -d %s", filename);
|
||||
|
||||
printf("Unzip %s\n", filename);
|
||||
|
||||
system(command);
|
||||
|
||||
char new_path[2048];
|
||||
size_t new_path_size = (filename_size - 3);
|
||||
|
||||
snprintf(new_path, sizeof(new_path)-1, "%.*s", (int)new_path_size, filename);
|
||||
|
||||
printf("Process %s:\n", new_path);
|
||||
process(new_path, new_path_size, html_parser);
|
||||
printf("\n");
|
||||
|
||||
unlink(new_path);
|
||||
}
|
||||
|
||||
/* Write the one-line command synopsis to stderr. */
static void usage(void)
{
    fputs("commoncrawl <dir with *.warc.gz>\n", stderr);
}
|
||||
|
||||
//#include <dirent.h>
|
||||
//#include <string.h>
|
||||
//#include <unistd.h>
|
||||
//
|
||||
//#include <myhtml/api.h>
|
||||
//
|
||||
//#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
|
||||
//
|
||||
//static myhtml_tree_t* global_tree;
|
||||
//
|
||||
//#define total_count_size 20
|
||||
//static size_t total_count[total_count_size];
|
||||
//
|
||||
//typedef void (*process_state_f)(const char* data, size_t filename_size);
|
||||
//typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
|
||||
//
|
||||
//void print_total_count(void)
|
||||
//{
|
||||
// size_t total = 0;
|
||||
// for(size_t i = 0; i < 7; i++)
|
||||
// total += total_count[i];
|
||||
//
|
||||
// printf("Total: %zu\n" ,total);
|
||||
//
|
||||
// printf("\t0-100: %zu\n", total_count[0]);
|
||||
// printf("\t100-1000: %zu\n", total_count[1]);
|
||||
// printf("\t1000-5000: %zu\n", total_count[2]);
|
||||
// printf("\t5000-10000: %zu\n", total_count[3]);
|
||||
// printf("\t10000-50000: %zu\n", total_count[4]);
|
||||
// printf("\t50000-100000: %zu\n", total_count[5]);
|
||||
// printf("\t100000 and up: %zu\n", total_count[6]);
|
||||
//}
|
||||
//
|
||||
//void listdir(const char *name, process_state_f callback)
|
||||
//{
|
||||
// memset(total_count, 0, sizeof(size_t) * total_count_size);
|
||||
//
|
||||
// DIR *dir;
|
||||
// struct dirent *entry;
|
||||
//
|
||||
// if(!(dir = opendir(name)))
|
||||
// return;
|
||||
// if(!(entry = readdir(dir)))
|
||||
// return;
|
||||
//
|
||||
// do {
|
||||
// if(entry->d_type == DT_DIR) {
|
||||
// char path[2048];
|
||||
//
|
||||
// int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
|
||||
// path[len] = '\0';
|
||||
//
|
||||
// if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
|
||||
// continue;
|
||||
//
|
||||
// listdir(path, callback);
|
||||
// }
|
||||
// else {
|
||||
// char path[2048];
|
||||
//
|
||||
// int len = snprintf(path, sizeof(path)-1, "%s/%s", name, entry->d_name);
|
||||
// path[len] = '\0';
|
||||
//
|
||||
// if(path[ (len - 3) ] == '.' && path[ (len - 2) ] == 'g' && path[ (len - 1) ] == 'z') {
|
||||
// callback(path, len);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// while ((entry = readdir(dir)));
|
||||
//
|
||||
// closedir(dir);
|
||||
//}
|
||||
//
|
||||
//void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
|
||||
//{
|
||||
// memset(total_count, 0, sizeof(size_t) * total_count_size);
|
||||
//
|
||||
// FILE *fh = fopen(filename, "rb");
|
||||
// if(fh == NULL) {
|
||||
// fprintf(stderr, "Can't open html file: %s\n", filename);
|
||||
// exit(EXIT_FAILURE);
|
||||
// }
|
||||
//
|
||||
// fseek(fh, 0L, SEEK_END);
|
||||
// long size = ftell(fh);
|
||||
// fseek(fh, 0L, SEEK_SET);
|
||||
//
|
||||
// char *data = (char*)malloc(size + 1);
|
||||
// if(data == NULL) {
|
||||
// fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
|
||||
// exit(EXIT_FAILURE);
|
||||
// }
|
||||
//
|
||||
// size_t nread = fread(data, 1, size, fh);
|
||||
// if (nread != size) {
|
||||
// fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", size, nread);
|
||||
// exit(EXIT_FAILURE);
|
||||
// }
|
||||
//
|
||||
// fclose(fh);
|
||||
//
|
||||
// if(size < 0)
|
||||
// size = 0;
|
||||
//
|
||||
// size_t from = 0;
|
||||
// char path[2048];
|
||||
//
|
||||
// for(size_t i = 0; i < size; i++) {
|
||||
// if(data[i] == '\n') {
|
||||
// int len = snprintf(path, sizeof(path)-1, "%s/%.*s", db_dir, (int)(i - from), &data[from]);
|
||||
// path[len] = '\0';
|
||||
//
|
||||
// callback(path, len);
|
||||
//
|
||||
// from = i + 1;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// free(data);
|
||||
//}
|
||||
//
|
||||
//void process(const char* filename, size_t filename_size, parser_state_f parser)
|
||||
//{
|
||||
// FILE *fh = fopen(filename, "rb");
|
||||
// if(fh == NULL) {
|
||||
// fprintf(stderr, "Can't open html file: %s\n", filename);
|
||||
// exit(EXIT_FAILURE);
|
||||
// }
|
||||
//
|
||||
// fseek(fh, 0L, SEEK_SET);
|
||||
//
|
||||
// const char *ct = "Content-Length:";
|
||||
// size_t ct_size = strlen(ct);
|
||||
//
|
||||
// char * line = NULL;
|
||||
// long get_size = 0;
|
||||
// ssize_t read = 0;
|
||||
//
|
||||
// size_t count = 0, read_len = 0;
|
||||
//
|
||||
// while ((read = getline(&line, &read_len, fh)) != -1) {
|
||||
//
|
||||
// if(strncmp(ct, line, ct_size) == 0) {
|
||||
// size_t i;
|
||||
//
|
||||
// for(i = ct_size; i < read_len; i++)
|
||||
// if(line[i] != '\n' && line[i] != '\r' && line[i] != ' ')
|
||||
// break;
|
||||
//
|
||||
// get_size = strtol(&line[i], NULL, 0);
|
||||
// }
|
||||
// else if(get_size && line[0] == '\r' && line[1] == '\n') {
|
||||
// long head_begin = ftell(fh) + 2;
|
||||
// long end = head_begin + get_size;
|
||||
//
|
||||
// while ((read = getline(&line, &read_len, fh)) != -1) {
|
||||
// //printf("%.*s", (int)read_len, line);
|
||||
//
|
||||
// if(line[0] == '\r' && line[1] == '\n')
|
||||
// break;
|
||||
// }
|
||||
//
|
||||
// long head_end = ftell(fh);
|
||||
//
|
||||
// size_t html_length = (end - head_end);
|
||||
// char *html = malloc(html_length + 1);
|
||||
//
|
||||
// size_t nread = fread(html, 1, html_length, fh);
|
||||
// if (nread != html_length) {
|
||||
// fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", html_length, nread);
|
||||
// exit(EXIT_FAILURE);
|
||||
// }
|
||||
//
|
||||
// count++;
|
||||
// parser(html, html_length, count);
|
||||
//
|
||||
// get_size = 0;
|
||||
// free(html);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// fclose(fh);
|
||||
//}
|
||||
//
|
||||
//void html_parser(const char* html, size_t html_length, size_t count)
|
||||
//{
|
||||
// if((count % 1000) == 0) {
|
||||
// printf("\t%zu\n", count);
|
||||
// }
|
||||
//
|
||||
// myencoding_t encoding = 0;
|
||||
// //myhtml_encoding_detect(html, html_length, &encoding);
|
||||
//
|
||||
// // parse html
|
||||
// myhtml_status_t status = myhtml_parse(global_tree, encoding, html, html_length);
|
||||
// if(status != MyHTML_STATUS_OK) {
|
||||
// fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
|
||||
// exit(EXIT_FAILURE);
|
||||
// }
|
||||
//
|
||||
// if(html_length < 100)
|
||||
// total_count[0]++;
|
||||
// else if(html_length >= 100 && html_length < 1000)
|
||||
// total_count[1]++;
|
||||
// else if(html_length >= 1000 && html_length < 5000)
|
||||
// total_count[2]++;
|
||||
// else if(html_length >= 5000 && html_length < 10000)
|
||||
// total_count[3]++;
|
||||
// else if(html_length >= 10000 && html_length < 50000)
|
||||
// total_count[4]++;
|
||||
// else if(html_length >= 50000 && html_length < 100000)
|
||||
// total_count[5]++;
|
||||
// else if(html_length >= 100000)
|
||||
// total_count[6]++;
|
||||
//
|
||||
// //myhtml_tree_print_node_children(global_tree, global_tree->document, stdout, 0);
|
||||
//}
|
||||
//
|
||||
//void process_unpack(const char* filename, size_t filename_size)
|
||||
//{
|
||||
// char command[2048];
|
||||
// snprintf(command, sizeof(command)-1, "gzip -k -d %s", filename);
|
||||
//
|
||||
// printf("Unzip %s\n", filename);
|
||||
//
|
||||
// system(command);
|
||||
//
|
||||
// char new_path[2048];
|
||||
// size_t new_path_size = (filename_size - 3);
|
||||
//
|
||||
// snprintf(new_path, sizeof(new_path)-1, "%.*s", (int)new_path_size, filename);
|
||||
//
|
||||
// printf("Process %s:\n", new_path);
|
||||
// process(new_path, new_path_size, html_parser);
|
||||
// printf("\n");
|
||||
//
|
||||
// unlink(new_path);
|
||||
//}
|
||||
//
|
||||
//static void usage(void)
|
||||
//{
|
||||
// fprintf(stderr, "commoncrawl <dir with *.warc.gz>\n");
|
||||
//}
|
||||
//
|
||||
int main(int argc, const char * argv[])
|
||||
{
|
||||
if (argc != 2) {
|
||||
usage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// basic init
|
||||
myhtml_t* myhtml = myhtml_create();
|
||||
myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
|
||||
|
||||
// first tree init
|
||||
global_tree = myhtml_tree_create();
|
||||
myhtml_tree_init(global_tree, myhtml);
|
||||
|
||||
listdir(argv[1], process_unpack);
|
||||
|
||||
// release resources
|
||||
myhtml_tree_destroy(global_tree);
|
||||
myhtml_destroy(myhtml);
|
||||
|
||||
print_total_count();
|
||||
// if (argc != 2) {
|
||||
// usage();
|
||||
// return 0;
|
||||
// }
|
||||
//
|
||||
// // basic init
|
||||
// myhtml_t* myhtml = myhtml_create();
|
||||
// myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
|
||||
//
|
||||
// // first tree init
|
||||
// global_tree = myhtml_tree_create();
|
||||
// myhtml_tree_init(global_tree, myhtml);
|
||||
//
|
||||
// listdir(argv[1], process_unpack);
|
||||
//
|
||||
// // release resources
|
||||
// myhtml_tree_destroy(global_tree);
|
||||
// myhtml_destroy(myhtml);
|
||||
//
|
||||
// print_total_count();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user