Comment all code in commoncrawl

This commit is contained in:
lexborisov 2017-03-20 22:01:54 +03:00
parent 2960726738
commit e37c7ce017

View File

@ -21,272 +21,272 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <dirent.h> //#include <dirent.h>
#include <string.h> //#include <string.h>
#include <unistd.h> //#include <unistd.h>
//
#include <myhtml/api.h> //#include <myhtml/api.h>
//
// Print a printf-style message to stderr and terminate with EXIT_FAILURE.
#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)

// Tree passed to myhtml_parse() by html_parser(); main() creates and destroys it.
static myhtml_tree_t* global_tree;

// Histogram of parsed document sizes. The array has room for 20 buckets, but
// html_parser() and print_total_count() only touch indexes 0..6.
#define total_count_size 20
static size_t total_count[total_count_size];

// Callback invoked by listdir()/read_loaded() with a file path and its length.
typedef void (*process_state_f)(const char* data, size_t filename_size);
// Callback invoked by process() with a document buffer, its length (the
// parameter is named filename_size) and the running document number.
typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
//
void print_total_count(void) //void print_total_count(void)
{ //{
size_t total = 0; // size_t total = 0;
for(size_t i = 0; i < 7; i++) // for(size_t i = 0; i < 7; i++)
total += total_count[i]; // total += total_count[i];
//
printf("Total: %zu\n" ,total); // printf("Total: %zu\n" ,total);
//
printf("\t0-100: %zu\n", total_count[0]); // printf("\t0-100: %zu\n", total_count[0]);
printf("\t100-1000: %zu\n", total_count[1]); // printf("\t100-1000: %zu\n", total_count[1]);
printf("\t1000-5000: %zu\n", total_count[2]); // printf("\t1000-5000: %zu\n", total_count[2]);
printf("\t5000-10000: %zu\n", total_count[3]); // printf("\t5000-10000: %zu\n", total_count[3]);
printf("\t10000-50000: %zu\n", total_count[4]); // printf("\t10000-50000: %zu\n", total_count[4]);
printf("\t50000-100000: %zu\n", total_count[5]); // printf("\t50000-100000: %zu\n", total_count[5]);
printf("\t100000 and up: %zu\n", total_count[6]); // printf("\t100000 and up: %zu\n", total_count[6]);
} //}
//
/*
 * Recursive worker for listdir(): visit every entry below `name` and call
 * `callback(path, length)` for each non-directory entry whose path ends
 * in ".gz". Directories that cannot be opened are silently skipped.
 */
static void listdir_walk(const char *name, process_state_f callback)
{
    DIR *dir = opendir(name);
    if (dir == NULL)
        return;

    struct dirent *entry;
    while ((entry = readdir(dir)) != NULL) {
        // Never descend into the self/parent links.
        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
            continue;

        char path[2048];
        int len = snprintf(path, sizeof(path), "%s/%s", name, entry->d_name);

        // snprintf() returns the length the full string would have had, so a
        // value >= sizeof(path) means truncation and must not be used as an index.
        if (len < 0 || (size_t)len >= sizeof(path)) {
            fprintf(stderr, "Path is too long, skipped: %s/%s\n", name, entry->d_name);
            continue;
        }

        if (entry->d_type == DT_DIR) {
            listdir_walk(path, callback);
        }
        else if (len >= 3 && strcmp(&path[len - 3], ".gz") == 0) {
            callback(path, (size_t)len);
        }
    }

    // Reached on every path after a successful opendir(), including an empty listing.
    closedir(dir);
}

/*
 * Reset the size histogram, then walk `name` recursively and invoke
 * `callback` for every "*.gz" file found.
 *
 * The histogram is cleared once here rather than on every recursion level,
 * so counts gathered before entering a sub-directory are kept.
 */
void listdir(const char *name, process_state_f callback)
{
    memset(total_count, 0, sizeof(size_t) * total_count_size);

    listdir_walk(name, callback);
}
//
/*
 * Reset the size histogram, read the list file `filename` and call
 * `callback(path, length)` for every newline-terminated line in it, where
 * path is "<db_dir>/<line>".
 *
 * A final line without a trailing '\n' is not reported (unchanged behaviour).
 * Entries whose path does not fit the internal buffer are skipped with a
 * message on stderr. Any I/O or allocation failure terminates the process.
 */
void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
{
    memset(total_count, 0, sizeof(size_t) * total_count_size);

    FILE *fh = fopen(filename, "rb");
    if(fh == NULL) {
        fprintf(stderr, "Can't open html file: %s\n", filename);
        exit(EXIT_FAILURE);
    }

    // Find the file size; ftell() returns -1 on failure, which must be
    // caught before the value is used as an allocation or read size.
    long size = -1;
    if(fseek(fh, 0L, SEEK_END) == 0)
        size = ftell(fh);

    if(size < 0 || fseek(fh, 0L, SEEK_SET) != 0) {
        fprintf(stderr, "Can't determine size of file: %s\n", filename);
        fclose(fh);
        exit(EXIT_FAILURE);
    }

    size_t data_size = (size_t)size;

    char *data = malloc(data_size + 1);
    if(data == NULL) {
        fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
        fclose(fh);
        exit(EXIT_FAILURE);
    }

    size_t nread = fread(data, 1, data_size, fh);
    fclose(fh);

    if(nread != data_size) {
        fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", size, nread);
        free(data);
        exit(EXIT_FAILURE);
    }
    data[data_size] = '\0';

    size_t from = 0;
    char path[2048];

    for(size_t i = 0; i < data_size; i++) {
        if(data[i] != '\n')
            continue;

        size_t name_len = i - from;
        int len = -1;

        // A name that is already longer than the buffer cannot fit; checking
        // first also keeps the cast to int for "%.*s" in range.
        if(name_len < sizeof(path))
            len = snprintf(path, sizeof(path), "%s/%.*s", db_dir, (int)name_len, &data[from]);

        if(len < 0 || (size_t)len >= sizeof(path))
            fprintf(stderr, "Path is too long, skipped entry at offset %zu of %s\n", from, filename);
        else
            callback(path, (size_t)len);

        from = i + 1;
    }

    free(data);
}
//
/*
 * Scan the file `filename` record by record and hand every record body to
 * `parser(data, length, n)`, where n is the 1-based number of the record
 * within this file.
 *
 * A record is recognised by a "Content-Length:" line followed by an empty
 * CRLF line. The lines up to the next empty CRLF line are skipped, and the
 * rest of the announced length is read into a NUL-terminated buffer.
 *
 * filename_size is unused (the path is NUL-terminated). Any I/O, allocation
 * or length error terminates the process.
 */
void process(const char* filename, size_t filename_size, parser_state_f parser)
{
    (void)filename_size;

    FILE *fh = fopen(filename, "rb");
    if(fh == NULL) {
        fprintf(stderr, "Can't open html file: %s\n", filename);
        exit(EXIT_FAILURE);
    }

    const char *ct = "Content-Length:";
    size_t ct_size = strlen(ct);

    char *line = NULL;      // buffer owned by getline(); freed before returning
    size_t line_cap = 0;
    ssize_t line_len = 0;

    long get_size = 0;
    size_t count = 0;

    while ((line_len = getline(&line, &line_cap, fh)) != -1) {

        if(strncmp(ct, line, ct_size) == 0) {
            // strtol() skips the leading blanks itself. Base 10 keeps a value
            // with leading zeros from being read as octal.
            get_size = strtol(&line[ct_size], NULL, 10);

            // A negative length is meaningless; treat it as "no length seen".
            if(get_size < 0)
                get_size = 0;
        }
        else if(get_size && line[0] == '\r' && line[1] == '\n') {
            long block_begin = ftell(fh);

            while ((line_len = getline(&line, &line_cap, fh)) != -1) {
                if(line[0] == '\r' && line[1] == '\n')
                    break;
            }

            long head_end = ftell(fh);

            // Bytes of the announced length still unread after the skipped
            // header lines. Both operands are non-negative, so no overflow.
            long remaining = -3;
            if(block_begin >= 0 && head_end >= block_begin)
                remaining = get_size - (head_end - block_begin);

            // The original code reads two bytes past the announced length
            // (head_begin = ftell + 2); that is kept as is.
            // NOTE(review): presumably this covers the CRLF that follows the
            // record block - confirm against the WARC layout.
            if(remaining < -2) {
                fprintf(stderr, "Bad record length %ld in file: %s\n", get_size, filename);
                exit(EXIT_FAILURE);
            }

            size_t html_length = (remaining < 0) ? (size_t)(remaining + 2)
                                                 : (size_t)remaining + 2;

            char *html = malloc(html_length + 1);
            if(html == NULL) {
                fprintf(stderr, "Can't allocate mem for html record in file: %s\n", filename);
                exit(EXIT_FAILURE);
            }

            size_t nread = fread(html, 1, html_length, fh);
            if (nread != html_length) {
                fprintf(stderr, "could not read %zu bytes (%zu bytes done)\n", html_length, nread);
                exit(EXIT_FAILURE);
            }
            html[html_length] = '\0';

            count++;
            parser(html, html_length, count);

            get_size = 0;
            free(html);
        }
    }

    free(line);
    fclose(fh);
}
//
/*
 * Parse one HTML document into global_tree and record its size in the
 * total_count histogram.
 *
 * Prints the running document number to stdout for every 1000th document.
 * If myhtml_parse() does not return MyHTML_STATUS_OK, the document is dumped
 * to stderr and the process exits.
 */
void html_parser(const char* html, size_t html_length, size_t count)
{
    // Exclusive upper bounds of buckets 0..5; anything larger goes to bucket 6.
    static const size_t bucket_limits[] = { 100, 1000, 5000, 10000, 50000, 100000 };
    const size_t limits_count = sizeof(bucket_limits) / sizeof(bucket_limits[0]);

    if (count % 1000 == 0)
        printf("\t%zu\n", count);

    // Encoding detection is not performed; the value 0 is passed as encoding.
    myencoding_t encoding = 0;

    if (myhtml_parse(global_tree, encoding, html, html_length) != MyHTML_STATUS_OK) {
        fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
        exit(EXIT_FAILURE);
    }

    // Find the first bucket whose upper bound is above the document size.
    size_t bucket = 0;
    while (bucket < limits_count && html_length >= bucket_limits[bucket])
        bucket++;

    total_count[bucket]++;
}
//
/*
 * Write `src` into `dst` wrapped in single quotes for /bin/sh, with every
 * embedded single quote emitted as '\''.
 * Returns 0 on success, -1 when the result does not fit into dst_size bytes.
 */
static int shell_quote_path(char *dst, size_t dst_size, const char *src)
{
    size_t pos = 0;

    if (dst_size < 3)
        return -1;

    dst[pos++] = '\'';

    for (; *src != '\0'; src++) {
        if (*src == '\'') {
            if (pos + 4 >= dst_size)
                return -1;

            memcpy(&dst[pos], "'\\''", 4);
            pos += 4;
        }
        else {
            if (pos + 1 >= dst_size)
                return -1;

            dst[pos++] = *src;
        }
    }

    // Room for the closing quote and the terminator.
    if (pos + 2 > dst_size)
        return -1;

    dst[pos++] = '\'';
    dst[pos] = '\0';

    return 0;
}

/*
 * Unpack the gzip archive `filename` next to itself (the archive is kept),
 * run process() with html_parser on the unpacked file and delete it again.
 *
 * filename_size is the length of `filename`; the unpacked name is the same
 * path without its last three characters (".gz"). Names that are too short
 * or too long, and archives gzip fails on, are skipped with a message on
 * stderr.
 */
void process_unpack(const char* filename, size_t filename_size)
{
    char new_path[2048];

    // Without this guard "filename_size - 3" wraps around for short names,
    // the unpacked path equals the archive path, and unlink() below would
    // delete the source file.
    if (filename_size <= 3 || (filename_size - 3) >= sizeof(new_path)) {
        fprintf(stderr, "Unexpected file name, skipped: %s\n", filename);
        return;
    }

    // The name goes through the shell, so quote it: blanks or shell
    // metacharacters in a path must not change the command.
    char quoted[2048];
    if (shell_quote_path(quoted, sizeof(quoted), filename) != 0) {
        fprintf(stderr, "File name is too long, skipped: %s\n", filename);
        return;
    }

    // "--" stops option parsing, so a path starting with '-' stays a path.
    char command[sizeof(quoted) + 32];
    snprintf(command, sizeof(command), "gzip -k -d -- %s", quoted);

    printf("Unzip %s\n", filename);

    if (system(command) != 0) {
        fprintf(stderr, "Can't unzip file: %s\n", filename);
        return;
    }

    size_t new_path_size = (filename_size - 3);

    memcpy(new_path, filename, new_path_size);
    new_path[new_path_size] = '\0';

    printf("Process %s:\n", new_path);
    process(new_path, new_path_size, html_parser);
    printf("\n");

    unlink(new_path);
}
//
/* Write the one-line command synopsis to stderr. */
static void usage(void)
{
    fputs("commoncrawl <dir with *.warc.gz>\n", stderr);
}
//
int main(int argc, const char * argv[]) int main(int argc, const char * argv[])
{ {
if (argc != 2) { // if (argc != 2) {
usage(); // usage();
return 0; // return 0;
} // }
//
// basic init // // basic init
myhtml_t* myhtml = myhtml_create(); // myhtml_t* myhtml = myhtml_create();
myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0); // myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
//
// first tree init // // first tree init
global_tree = myhtml_tree_create(); // global_tree = myhtml_tree_create();
myhtml_tree_init(global_tree, myhtml); // myhtml_tree_init(global_tree, myhtml);
//
listdir(argv[1], process_unpack); // listdir(argv[1], process_unpack);
//
// release resources // // release resources
myhtml_tree_destroy(global_tree); // myhtml_tree_destroy(global_tree);
myhtml_destroy(myhtml); // myhtml_destroy(myhtml);
//
print_total_count(); // print_total_count();
return 0; return 0;
} }