mirror of
https://github.com/lexborisov/Modest
synced 2024-11-21 13:21:54 +03:00
Added new rule for make (make modules)
This commit is contained in:
parent
3b3625cb1b
commit
2960726738
76
INSTALL.md
76
INSTALL.md
@ -1,51 +1,47 @@
|
||||
# Modest: Build and Installation
|
||||
|
||||
## make
|
||||
## GNU Make
|
||||
|
||||
In root directory:
|
||||
In root directory of project (`/`):
|
||||
```bash
|
||||
make
|
||||
```
|
||||
|
||||
If successful copy lib/* and include/* at the right place for you
|
||||
|
||||
Flags that can be passed to make:
|
||||
- `MODEST_OPTIMIZATION_LEVEL=-O2` set compiler optimization level. Default: -O2
|
||||
- `MODEST_BUILD_WITHOUT_THREADS=YES` build without POSIX Threads. Default: NO
|
||||
|
||||
*for example*
|
||||
```bash
|
||||
make MODEST_BUILD_WITHOUT_THREADS=NO
|
||||
```
|
||||
|
||||
and copy to the right place for you
|
||||
```bash
|
||||
cp lib/* /usr/local/lib
|
||||
cp -r include/* /usr/local/include
|
||||
```
|
||||
|
||||
## cmake
|
||||
|
||||
In `project` directory:
|
||||
```bash
|
||||
cmake .
|
||||
make
|
||||
make test
|
||||
sudo make install
|
||||
```
|
||||
|
||||
Flags that can be passed to CMake:
|
||||
- `MODEST_OPTIMIZATION_LEVEL=-O2` set compiler optimization level. Default: -O2
|
||||
- `CMAKE_INSTALL_LIBDIR=lib` set path to install created library. Default: lib
|
||||
- `MODEST_BUILD_SHARED=ON` build shared library. Default: ON
|
||||
- `MODEST_BUILD_STATIC=ON` build static library. Default: ON
|
||||
- `MODEST_INSTALL_HEADER=OFF` install header files. Default ON
|
||||
- `MODEST_BUILD_WITHOUT_THREADS=YES` build without POSIX Threads. Default: NO
|
||||
- `MODEST_EXTERN_MALLOC=my_malloc_func` set extern malloc function. Default: UNDEFINED
|
||||
- `MODEST_EXTERN_REALLOC=my_realloc_func` set extern realloc function. Default: UNDEFINED
|
||||
- `MODEST_EXTERN_CALLOC=my_calloc_func` set extern calloc function. Default: UNDEFINED
|
||||
- `MODEST_EXTERN_FREE=my_free_func` set extern free function. Default: UNDEFINED
|
||||
Flags that can be passed to make:
|
||||
- `prefix`, default /usr/local
|
||||
- `OS`, if not defined try to get from "uname -s"
|
||||
- `PROJECT_OPTIMIZATION_LEVEL`, default -O2
|
||||
- `MyCORE_BUILD_WITHOUT_THREADS`, YES or (NO or undefined), default undefined
|
||||
- `MyCORE_BUILD_DEBUG`, YES or (NO or undefined), default undefined
|
||||
- `MyCORE_WITH_PERF`, YES or (NO or undefined), default undefined; tries to build with timers (rdtsc or similar); OS dependent, may not work on some systems
|
||||
- `PROJECT_INSTALL_HEADER`, default "include"
|
||||
- `PROJECT_INSTALL_LIBRARY`, default "lib"
|
||||
- `PROJECT_INSTALL_WITHOUT_HEADERS`, YES or (NO or undefined), default undefined
|
||||
|
||||
*for example*
|
||||
*for example*:
|
||||
```bash
|
||||
cmake . -DCMAKE_INSTALL_LIBDIR=lib64 -DMODEST_INSTALL_HEADER=ON
|
||||
make -j4 prefix=/usr MyCORE_BUILD_WITHOUT_THREADS=YES
|
||||
sudo make install
|
||||
```
|
||||
|
||||
Makefile rules:
|
||||
- `all` -- build all components (libraries, examples, tests) (default)
|
||||
- `library` -- build only static and shared library
|
||||
- `shared` -- build only shared library
|
||||
- `static` -- build only static library
|
||||
- `clean` -- clean up current build directory
|
||||
- `clone` -- copy all headers from source to include directories and modify local include (`#include "..."`) to global (`#include <...>`)
|
||||
- `clean_api` -- remove all headers from include directory
|
||||
- `create` -- create directories for binary, libraries, tests
|
||||
- `test` -- run all tests
|
||||
- `modules` -- print modules name, description, dependencies
|
||||
- `install` -- install libraries and headers on your system
|
||||
- `uninstall` -- delete libraries and headers on your system
|
||||
- `make-pc-file` -- create pkg-config file
|
||||
|
||||
*for example*:
|
||||
```bash
|
||||
make shared
|
||||
```
|
||||
|
40
Makefile
40
Makefile
@ -15,18 +15,19 @@ CC ?= gcc
|
||||
# install -- install libraries and headers on your system
|
||||
# uninstall -- delete libraries and headers on your system
|
||||
# test -- run all tests
|
||||
# modules -- print modules list: Module name, Description, Dependencies
|
||||
# make-pc-file -- create pkg-config file
|
||||
#
|
||||
# ARGS
|
||||
# prefix, default /usr/local
|
||||
# OS, if not defined try to get from "uname -s"
|
||||
# MODEST_OPTIMIZATION_LEVEL, default -O2
|
||||
# PROJECT_OPTIMIZATION_LEVEL, default -O2
|
||||
# MyCORE_BUILD_WITHOUT_THREADS, YES or (NO or undefined), default undefined
|
||||
# MyCORE_BUILD_DEBUG, YES or (NO or undefined), default undefined
|
||||
# MyCORE_WITH_PERF, YES or (NO or undefined), default undefined, try build with timers (rdtsc or some), OS dependent, may not work on some systems,
|
||||
# MODEST_INSTALL_HEADER, default "include"
|
||||
# MODEST_INSTALL_LIBRARY, default "lib"
|
||||
# MODEST_INSTALL_WITHOUT_HEADERS, YES or (NO or undefined), default undefined
|
||||
# PROJECT_INSTALL_HEADER, default "include"
|
||||
# PROJECT_INSTALL_LIBRARY, default "lib"
|
||||
# PROJECT_INSTALL_WITHOUT_HEADERS, YES or (NO or undefined), default undefined
|
||||
#
|
||||
# If no build rules exist for the OS, we try to build the library with POSIX threads
|
||||
|
||||
@ -76,6 +77,12 @@ MyPORT_SELECTED_PORT = myport/$(strip $(MODEST_PORT_NAME))
|
||||
#***************
|
||||
include $(MODEST_BUILD_MODULES_MAKEFILES_LIST)
|
||||
|
||||
#********************
|
||||
# Modules info
|
||||
#***************
|
||||
MODEST_BUILD_MODULES_INFO_DEP = $(foreach dep,$(strip $($1_dependencies)), $(dep))
|
||||
MODEST_BUILD_MODULES_INFO := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),$(MODEST_UTILS_NEW_LINE)Module: $(name)$(MODEST_UTILS_NEW_LINE)Description: $($(name)_description)$(MODEST_UTILS_NEW_LINE)Dependencies:$(call MODEST_BUILD_MODULES_INFO_DEP,$(name))$(MODEST_UTILS_NEW_LINE))
|
||||
|
||||
#********************
|
||||
# Set ARGS for flags
|
||||
#***************
|
||||
@ -102,17 +109,17 @@ BUILD_SUB_DIRS := examples $(TEST_DIR)
|
||||
#********************
|
||||
# Install
|
||||
#***************
|
||||
MODEST_INSTALL_LIBRARY := lib
|
||||
MODEST_INSTALL_HEADER := include
|
||||
PROJECT_INSTALL_LIBRARY := lib
|
||||
PROJECT_INSTALL_HEADER := include
|
||||
|
||||
libdir ?= $(prefix)/$(MODEST_INSTALL_LIBRARY)
|
||||
includedir ?= $(prefix)/$(MODEST_INSTALL_HEADER)
|
||||
libdir ?= $(prefix)/$(PROJECT_INSTALL_LIBRARY)
|
||||
includedir ?= $(prefix)/$(PROJECT_INSTALL_HEADER)
|
||||
|
||||
MODEST_INSTALL_CREATE_DIR := mkdir -p $(prefix)/$(MODEST_INSTALL_LIBRARY)
|
||||
MODEST_INSTALL_CREATE_DIR := mkdir -p $(prefix)/$(PROJECT_INSTALL_LIBRARY)
|
||||
MODEST_INSTALL_COMMAND := $(MODEST_INSTALL_CREATE_DIR) $(MODEST_UTILS_NEW_LINE) cp -av $(LIB_DIR_BASE)/* $(libdir)
|
||||
|
||||
ifneq ($(MODEST_INSTALL_WITHOUT_HEADERS),YES)
|
||||
MODEST_INSTALL_CREATE_DIR += $(prefix)/$(MODEST_INSTALL_HEADER)
|
||||
ifneq ($(PROJECT_INSTALL_WITHOUT_HEADERS),YES)
|
||||
MODEST_INSTALL_CREATE_DIR += $(prefix)/$(PROJECT_INSTALL_HEADER)
|
||||
MODEST_INSTALL_COMMAND += $(MODEST_UTILS_NEW_LINE) cp -av $(INCLUDE_DIR_API)/* $(includedir)
|
||||
endif
|
||||
|
||||
@ -122,7 +129,7 @@ endif
|
||||
MODEST_UNINSTALL_MK_COMMAND :=
|
||||
MODEST_UNINSTALL_FILE := uninstal.mk
|
||||
|
||||
ifneq ($(MODEST_INSTALL_WITHOUT_HEADERS),YES)
|
||||
ifneq ($(PROJECT_INSTALL_WITHOUT_HEADERS),YES)
|
||||
MODEST_UNINSTALL_HEADERS := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),rm -rf $(includedir)/$(name) \$$(MODEST_UTILS_NEW_LINE))
|
||||
endif
|
||||
|
||||
@ -138,11 +145,11 @@ MODEST_PKG_CONFIG_FILE := modest.pc
|
||||
MODEST_PKG_CONFIG_CFLAGS := $(foreach name,$(MODEST_BUILD_MODULES_LIST_WITHOUT_PORT),-I$\{includedir}/$(name))
|
||||
MODEST_PKG_CONFIG_PROCESS = \
|
||||
$(SED) \
|
||||
-e 's,@version\@,$(MODEST_VERSION_STRING),g' \
|
||||
-e 's,@version\@,$(PROJECT_VERSION_STRING),g' \
|
||||
-e 's,@prefix\@,$(prefix),g' \
|
||||
-e 's,@exec_prefix\@,$(exec_prefix),g' \
|
||||
-e 's,@libdir\@,$(MODEST_INSTALL_LIBRARY),g' \
|
||||
-e 's,@includedir\@,$(MODEST_INSTALL_HEADER),g' \
|
||||
-e 's,@libdir\@,$(PROJECT_INSTALL_LIBRARY),g' \
|
||||
-e 's,@includedir\@,$(PROJECT_INSTALL_HEADER),g' \
|
||||
-e 's,@cflags\@,$(MODEST_PKG_CONFIG_CFLAGS),g' \
|
||||
-e 's,@libname\@,$(LIB_NAME),g' \
|
||||
-e 's,@description\@,$(DESCRIPTION),g' \
|
||||
@ -193,4 +200,7 @@ test: library
|
||||
make-pc-file:
|
||||
$(call MODEST_PKG_CONFIG_PROCESS,$(MODEST_PKG_CONFIG_FILE).in, $(MODEST_PKG_CONFIG_FILE))
|
||||
|
||||
modules:
|
||||
$(info $(MODEST_BUILD_MODULES_INFO))
|
||||
|
||||
.PHONY: all clean clone test $(MODEST_BUILD_MODULES_TARGET_ALL)
|
||||
|
20
Makefile.cfg
20
Makefile.cfg
@ -3,11 +3,11 @@ MODEST_BUILD_OS := UNDEF
|
||||
#********************
|
||||
# Version
|
||||
#***************
|
||||
MODEST_VERSION_MAJOR := 0
|
||||
MODEST_VERSION_MINOR := 0
|
||||
MODEST_VERSION_PATCH := 6
|
||||
PROJECT_VERSION_MAJOR := 0
|
||||
PROJECT_VERSION_MINOR := 0
|
||||
PROJECT_VERSION_PATCH := 6
|
||||
|
||||
MODEST_VERSION_STRING := $(MODEST_VERSION_MAJOR).$(MODEST_VERSION_MINOR).$(MODEST_VERSION_PATCH)
|
||||
PROJECT_VERSION_STRING := $(PROJECT_VERSION_MAJOR).$(PROJECT_VERSION_MINOR).$(PROJECT_VERSION_PATCH)
|
||||
|
||||
#********************
|
||||
# Flags
|
||||
@ -37,14 +37,14 @@ LIB_DIR_BASE := lib
|
||||
# for use actual variables like a LIB_NAME_SUFFIX
|
||||
MODEST_LIBRARY_NAME ?= lib$(LIB_NAME)$(LIB_NAME_SUFFIX)
|
||||
MODEST_LIBRARY_NAME_STATIC ?=lib$(LIB_NAME)$(LIB_NAME_SUFFIX_STATIC)
|
||||
MODEST_LIBRARY_NAME_WITH_VERSION = lib$(LIB_NAME)-$(MODEST_VERSION_STRING)$(LIB_NAME_SUFFIX)
|
||||
MODEST_LIBRARY_NAME_WITH_VERSION = lib$(LIB_NAME)-$(PROJECT_VERSION_STRING)$(LIB_NAME_SUFFIX)
|
||||
|
||||
MODEST_LIBRARY ?= $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME)
|
||||
MODEST_LIBRARY_STATIC ?= $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME_STATIC)
|
||||
MODEST_LIBRARY_WITH_VERSION = $(LIB_DIR_BASE)/$(MODEST_LIBRARY_NAME_WITH_VERSION)
|
||||
|
||||
MODEST_LIBRARY_WITH_VERSION_MAJOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR)$(LIB_NAME_SUFFIX)
|
||||
MODEST_LIBRARY_WITH_VERSION_MAJOR_MINOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR).$(MODEST_VERSION_MINOR)$(LIB_NAME_SUFFIX)
|
||||
MODEST_LIBRARY_WITH_VERSION_MAJOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR)$(LIB_NAME_SUFFIX)
|
||||
MODEST_LIBRARY_WITH_VERSION_MAJOR_MINOR = $(LIB_DIR_BASE)/lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR).$(PROJECT_VERSION_MINOR)$(LIB_NAME_SUFFIX)
|
||||
|
||||
#********************
|
||||
# Binaries
|
||||
@ -75,7 +75,7 @@ SED ?= sed
|
||||
MODEST_DIR_SEPARATOR ?= /
|
||||
|
||||
# flags
|
||||
MODEST_OPTIMIZATION_LEVEL ?= -O2
|
||||
PROJECT_OPTIMIZATION_LEVEL ?= -O2
|
||||
MODEST_CFLAGS += -I$(INCLUDE_DIR)
|
||||
MODEST_LDFLAGS +=
|
||||
|
||||
@ -89,7 +89,7 @@ MODEST_CLONE_SED_HEADER_COMMAND = find $(INCLUDE_DIR_API) -name "*.h" -exec sed
|
||||
# Set -D
|
||||
#***************
|
||||
ifeq ($(MyCORE_BUILD_DEBUG),YES)
|
||||
override MODEST_OPTIMIZATION_LEVEL :=
|
||||
override PROJECT_OPTIMIZATION_LEVEL :=
|
||||
MODEST_CFLAGS += -g3 -ggdb3 -O0 -fno-omit-frame-pointer -DMyCORE_BUILD_DEBUG
|
||||
endif
|
||||
|
||||
@ -128,7 +128,7 @@ endif # def MODEST_PORT_NAME
|
||||
ifeq ($(MODEST_BUILD_OS),UNDEF)
|
||||
MODEST_CFLAGS += -fPIC
|
||||
MODEST_CFLAGS += -D_POSIX_C_SOURCE=199309L
|
||||
MODEST_CFLAGS += $(MODEST_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99
|
||||
MODEST_CFLAGS += $(PROJECT_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99
|
||||
|
||||
MODEST_BUILD_SHARED_AFTER += ln -sf $(call MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY) $(MODEST_UTILS_NEW_LINE)
|
||||
MODEST_BUILD_SHARED_AFTER += ln -sf $(call MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY_WITH_VERSION_MAJOR) $(MODEST_UTILS_NEW_LINE)
|
||||
|
@ -1,6 +1,9 @@
|
||||
modest_dirs := . finder style node layer render utils
|
||||
modest_objs := $(call MODEST_UTILS_OBJS,modest,$(modest_dirs))
|
||||
|
||||
modest_description := calculating, compare, renderer
|
||||
modest_dependencies := mycore mycss myencoding myfont myhtml myport myunicode myurl
|
||||
|
||||
modest_all: $(modest_objs)
|
||||
|
||||
modest_clean:
|
||||
|
@ -1,6 +1,9 @@
|
||||
mycore_dirs := . utils
|
||||
mycore_objs := $(call MODEST_UTILS_OBJS,mycore,$(mycore_dirs))
|
||||
|
||||
mycore_description := base module, it is used by all other modules
|
||||
mycore_dependencies :=
|
||||
|
||||
mycore_all: $(mycore_objs)
|
||||
|
||||
mycore_clean:
|
||||
|
@ -1,6 +1,9 @@
|
||||
mycss_dirs := . selectors namespace media values property declaration
|
||||
mycss_objs := $(call MODEST_UTILS_OBJS,mycss,$(mycss_dirs))
|
||||
|
||||
mycss_description := CSS parser and modules by https://drafts.csswg.org/
|
||||
mycss_dependencies := mycore myencoding myport
|
||||
|
||||
mycss_all: $(mycss_objs)
|
||||
|
||||
mycss_clean:
|
||||
|
@ -1,6 +1,9 @@
|
||||
myencoding_dirs := .
|
||||
myencoding_objs := $(call MODEST_UTILS_OBJS,myencoding,$(myencoding_dirs))
|
||||
|
||||
myencoding_description := work with character encodings, detecting encoding, convert encodings by https://encoding.spec.whatwg.org/
|
||||
myencoding_dependencies := mycore myport
|
||||
|
||||
myencoding_all: $(myencoding_objs)
|
||||
|
||||
myencoding_clean:
|
||||
|
@ -1,6 +1,9 @@
|
||||
myfont_dirs := .
|
||||
myfont_objs := $(call MODEST_UTILS_OBJS,myfont,$(myfont_dirs))
|
||||
|
||||
myfont_description := work with font, metrics, calculating size and more by https://www.microsoft.com/en-us/Typography/SpecificationsOverview.aspx
|
||||
myfont_dependencies := mycore myport
|
||||
|
||||
myfont_all: $(myfont_objs)
|
||||
|
||||
myfont_clean:
|
||||
|
@ -1,6 +1,9 @@
|
||||
myhtml_dirs := .
|
||||
myhtml_objs := $(call MODEST_UTILS_OBJS,myhtml,$(myhtml_dirs))
|
||||
|
||||
myhtml_description := HTML parser by https://html.spec.whatwg.org/multipage/
|
||||
myhtml_dependencies := mycore myencoding myport
|
||||
|
||||
myhtml_all: $(myhtml_objs)
|
||||
|
||||
myhtml_clean:
|
||||
|
@ -8,7 +8,7 @@ ifeq ($(OS),Darwin)
|
||||
MODEST_CLONE_SED_HEADER_COMMAND = find $(INCLUDE_DIR_API) -name "*.h" -exec sed -i '.bak' -E 's/^[ \t]*\#[ \t]*include[ \t]*"([^"]+)"/\#include <\1>/g' {} \;
|
||||
|
||||
MODEST_CFLAGS += -fPIC
|
||||
MODEST_CFLAGS += $(MODEST_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99
|
||||
MODEST_CFLAGS += $(PROJECT_OPTIMIZATION_LEVEL) -Wno-unused-variable -Wno-unused-function -std=c99
|
||||
LIB_NAME_SUFFIX := .dylib
|
||||
|
||||
MODEST_BUILD_SHARED_AFTER += ln -sf $(call MODEST_LIBRARY_NAME_WITH_VERSION) $(call MODEST_LIBRARY) $(MODEST_UTILS_NEW_LINE)
|
||||
|
@ -5,7 +5,7 @@ ifeq ($(OS),Windows_NT)
|
||||
LIB_NAME_SUFFIX := .dll
|
||||
LIB_NAME_SUFFIX_STATIC := .dll.a
|
||||
|
||||
MODEST_LIBRARY_NAME_WITH_VERSION := lib$(LIB_NAME)-$(MODEST_VERSION_MAJOR)$(LIB_NAME_SUFFIX)
|
||||
MODEST_LIBRARY_NAME_WITH_VERSION := lib$(LIB_NAME)-$(PROJECT_VERSION_MAJOR)$(LIB_NAME_SUFFIX)
|
||||
|
||||
MODEST_CFLAGS += -Wno-unused-variable -Wno-unused-function -std=c99
|
||||
MODEST_LDFLAGS += -Wl,--out-implib,$(call MODEST_LIBRARY_STATIC)
|
||||
|
@ -1,6 +1,9 @@
|
||||
myunicode_dirs := .
|
||||
myunicode_objs := $(call MODEST_UTILS_OBJS,myunicode,$(myunicode_dirs))
|
||||
|
||||
myunicode_description := unicode normalization, case work and other
|
||||
myunicode_dependencies := mycore myport
|
||||
|
||||
myunicode_all: $(myunicode_objs)
|
||||
|
||||
myunicode_clean:
|
||||
|
@ -1,6 +1,9 @@
|
||||
myurl_dirs := .
|
||||
myurl_objs := $(call MODEST_UTILS_OBJS,myurl,$(myurl_dirs))
|
||||
|
||||
myurl_description := URL parser by https://url.spec.whatwg.org/
|
||||
myurl_dependencies := mycore myport
|
||||
|
||||
myurl_all: $(myurl_objs)
|
||||
|
||||
myurl_clean:
|
||||
|
@ -53,9 +53,9 @@ include $(BINARY_BUILD_MODULES_MAKEFILES_LIST)
|
||||
#********************
|
||||
# Set ARGS for flags
|
||||
#***************
|
||||
CFLAGS += $(BINARY_CFLAGS)
|
||||
LDFLAGS += $(BINARY_LDFLAGS)
|
||||
LDLIBS += $(BINARY_LIBRARIES)
|
||||
override CFLAGS += $(BINARY_CFLAGS)
|
||||
override LDFLAGS += $(BINARY_LDFLAGS)
|
||||
override LDLIBS += $(BINARY_LIBRARIES)
|
||||
|
||||
#********************
|
||||
# Objects
|
||||
|
7
test/myhtml/Makefile.mk
Normal file
7
test/myhtml/Makefile.mk
Normal file
@ -0,0 +1,7 @@
|
||||
myhtml_dirs := .
|
||||
myhtml_objs := $(call BINARY_UTILS_OBJS,myhtml,$(myhtml_dirs))
|
||||
|
||||
myhtml_all: $(myhtml_objs)
|
||||
|
||||
myhtml_clean:
|
||||
rm -f $(myhtml_objs)
|
294
test/myhtml/commoncrawl.c
Normal file
294
test/myhtml/commoncrawl.c
Normal file
@ -0,0 +1,294 @@
|
||||
/*
|
||||
Copyright (C) 2016 Alexander Borisov
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
Author: lex.borisov@gmail.com (Alexander Borisov)
|
||||
For HTML Pages from: http://commoncrawl.org/
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <dirent.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <myhtml/api.h>
|
||||
|
||||
#define DIE(msg, ...) do { fprintf(stderr, msg, ##__VA_ARGS__); exit(EXIT_FAILURE); } while(0)
|
||||
|
||||
static myhtml_tree_t* global_tree;
|
||||
|
||||
#define total_count_size 20
|
||||
static size_t total_count[total_count_size];
|
||||
|
||||
typedef void (*process_state_f)(const char* data, size_t filename_size);
|
||||
typedef void (*parser_state_f)(const char* data, size_t filename_size, size_t count);
|
||||
|
||||
void print_total_count(void)
|
||||
{
|
||||
size_t total = 0;
|
||||
for(size_t i = 0; i < 7; i++)
|
||||
total += total_count[i];
|
||||
|
||||
printf("Total: %zu\n" ,total);
|
||||
|
||||
printf("\t0-100: %zu\n", total_count[0]);
|
||||
printf("\t100-1000: %zu\n", total_count[1]);
|
||||
printf("\t1000-5000: %zu\n", total_count[2]);
|
||||
printf("\t5000-10000: %zu\n", total_count[3]);
|
||||
printf("\t10000-50000: %zu\n", total_count[4]);
|
||||
printf("\t50000-100000: %zu\n", total_count[5]);
|
||||
printf("\t100000 and up: %zu\n", total_count[6]);
|
||||
}
|
||||
|
||||
/*
 * Recursive worker for listdir(): walk directory `name`, descend into
 * sub-directories and invoke `callback(path, path_length)` for every
 * non-directory entry whose path ends in ".gz".
 *
 * Directories are recognised through dirent's d_type field.
 */
static void listdir_walk(const char *name, process_state_f callback)
{
    DIR *dir = opendir(name);
    if(dir == NULL)
        return;

    struct dirent *entry;

    while((entry = readdir(dir)) != NULL) {
        if(strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
            continue;

        char path[2048];
        int len = snprintf(path, sizeof(path), "%s/%s", name, entry->d_name);

        /* snprintf returns the untruncated length, so it must be checked
           before it is used as an index into `path`. */
        if(len < 0 || (size_t)len >= sizeof(path)) {
            fprintf(stderr, "Path is too long, skipped: %s/%s\n", name, entry->d_name);
            continue;
        }

        if(entry->d_type == DT_DIR) {
            listdir_walk(path, callback);
        }
        else if(len >= 3 && strcmp(&path[len - 3], ".gz") == 0) {
            callback(path, (size_t)len);
        }
    }

    /* Always release the handle, including for empty directories. */
    closedir(dir);
}

/*
 * Reset the global size counters, then recursively scan directory `name`
 * and call `callback` for every "*.gz" file found below it.
 *
 * name     -- directory to scan; silently ignored if it cannot be opened
 * callback -- receives the file path and the length of that path
 */
void listdir(const char *name, process_state_f callback)
{
    /* Reset once per top-level call; doing it per recursion level would
       discard the counts gathered before entering a sub-directory. */
    memset(total_count, 0, sizeof(total_count));

    listdir_walk(name, callback);
}
|
||||
|
||||
/*
 * Reset the global size counters, read the list file `filename` and call
 * `callback` once per newline-terminated line, passing "<db_dir>/<line>"
 * and the length of that path.
 *
 * A final line without a trailing '\n' is not reported (unchanged from the
 * previous behaviour). Any I/O or allocation failure terminates the program.
 */
void read_loaded(const char *filename, const char *db_dir, process_state_f callback)
{
    memset(total_count, 0, sizeof(total_count));

    FILE *fh = fopen(filename, "rb");
    if(fh == NULL) {
        fprintf(stderr, "Can't open html file: %s\n", filename);
        exit(EXIT_FAILURE);
    }

    /* Determine the file size; validate it before it is used for malloc/fread. */
    long size = -1;
    if(fseek(fh, 0L, SEEK_END) == 0)
        size = ftell(fh);

    if(size < 0 || fseek(fh, 0L, SEEK_SET) != 0) {
        fprintf(stderr, "Can't determine size of file: %s\n", filename);
        fclose(fh);
        exit(EXIT_FAILURE);
    }

    size_t length = (size_t)size;

    char *data = malloc(length + 1);
    if(data == NULL) {
        fprintf(stderr, "Can't allocate mem for html file: %s\n", filename);
        fclose(fh);
        exit(EXIT_FAILURE);
    }

    size_t nread = fread(data, 1, length, fh);
    fclose(fh);

    if(nread != length) {
        fprintf(stderr, "could not read %ld bytes (%zu bytes done)\n", size, nread);
        free(data);
        exit(EXIT_FAILURE);
    }

    data[length] = '\0';

    size_t from = 0;
    char path[2048];

    for(size_t i = 0; i < length; i++) {
        if(data[i] != '\n')
            continue;

        int len = snprintf(path, sizeof(path), "%s/%.*s", db_dir, (int)(i - from), &data[from]);

        /* snprintf returns the untruncated length; never index with it unchecked. */
        if(len < 0 || (size_t)len >= sizeof(path))
            fprintf(stderr, "Path is too long, skipped: %s/%.*s\n", db_dir, (int)(i - from), &data[from]);
        else
            callback(path, (size_t)len);

        from = i + 1;
    }

    free(data);
}
|
||||
|
||||
/*
 * Scan the file `filename` line by line. Every "Content-Length:" header
 * sets the size of the upcoming record; at the next empty ("\r\n") line the
 * following lines up to the next empty line are skipped, the remaining
 * bytes of the record are read into memory and handed to `parser` together
 * with a running 1-based record counter.
 *
 * filename      -- path of the unpacked file to read
 * filename_size -- unused; kept so the signature stays unchanged
 * parser        -- called as parser(data, data_length, count) per record
 *
 * Open, allocation and short-read failures terminate the program.
 */
void process(const char* filename, size_t filename_size, parser_state_f parser)
{
    (void)filename_size;

    FILE *fh = fopen(filename, "rb");
    if(fh == NULL) {
        fprintf(stderr, "Can't open html file: %s\n", filename);
        exit(EXIT_FAILURE);
    }

    const char *ct = "Content-Length:";
    size_t ct_size = strlen(ct);

    char *line = NULL;      /* getline-owned buffer, released at the end */
    size_t line_cap = 0;
    ssize_t read;

    long get_size = 0;
    size_t count = 0;

    while((read = getline(&line, &line_cap, fh)) != -1) {
        if(strncmp(ct, line, ct_size) == 0) {
            /* Skip blanks after the header name, bounded by the line
               length rather than by the buffer capacity. */
            size_t i = ct_size;
            while(i < (size_t)read && (line[i] == ' ' || line[i] == '\r' || line[i] == '\n'))
                i++;

            /* Base 10: a leading zero must not switch strtol to octal. */
            get_size = strtol(&line[i], NULL, 10);
        }
        else if(get_size > 0 && line[0] == '\r' && line[1] == '\n') {
            /* NOTE(review): the extra 2 bytes look like they account for the
               CRLF that follows the record block -- confirm against the
               WARC layout. */
            long head_begin = ftell(fh) + 2;
            long end = head_begin + get_size;

            /* Skip the inner header lines up to the next empty line. */
            while((read = getline(&line, &line_cap, fh)) != -1) {
                if(line[0] == '\r' && line[1] == '\n')
                    break;
            }

            long head_end = ftell(fh);
            get_size = 0;

            /* A negative remainder would wrap to a huge size_t below. */
            if(head_end < 0 || end < head_end) {
                fprintf(stderr, "Inconsistent Content-Length, record skipped in: %s\n", filename);
                continue;
            }

            size_t html_length = (size_t)(end - head_end);

            char *html = malloc(html_length + 1);
            if(html == NULL) {
                fprintf(stderr, "Can't allocate mem for html from file: %s\n", filename);
                exit(EXIT_FAILURE);
            }

            size_t nread = fread(html, 1, html_length, fh);
            if(nread != html_length) {
                fprintf(stderr, "could not read %zu bytes (%zu bytes done)\n", html_length, nread);
                exit(EXIT_FAILURE);
            }

            html[html_length] = '\0';

            count++;
            parser(html, html_length, count);

            free(html);
        }
    }

    free(line);
    fclose(fh);
}
|
||||
|
||||
void html_parser(const char* html, size_t html_length, size_t count)
|
||||
{
|
||||
if((count % 1000) == 0) {
|
||||
printf("\t%zu\n", count);
|
||||
}
|
||||
|
||||
myencoding_t encoding = 0;
|
||||
//myhtml_encoding_detect(html, html_length, &encoding);
|
||||
|
||||
// parse html
|
||||
myhtml_status_t status = myhtml_parse(global_tree, encoding, html, html_length);
|
||||
if(status != MyHTML_STATUS_OK) {
|
||||
fprintf(stderr, "Can't parse:\n%.*s\n", (int)html_length, html);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if(html_length < 100)
|
||||
total_count[0]++;
|
||||
else if(html_length >= 100 && html_length < 1000)
|
||||
total_count[1]++;
|
||||
else if(html_length >= 1000 && html_length < 5000)
|
||||
total_count[2]++;
|
||||
else if(html_length >= 5000 && html_length < 10000)
|
||||
total_count[3]++;
|
||||
else if(html_length >= 10000 && html_length < 50000)
|
||||
total_count[4]++;
|
||||
else if(html_length >= 50000 && html_length < 100000)
|
||||
total_count[5]++;
|
||||
else if(html_length >= 100000)
|
||||
total_count[6]++;
|
||||
|
||||
//myhtml_tree_print_node_children(global_tree, global_tree->document, stdout, 0);
|
||||
}
|
||||
|
||||
void process_unpack(const char* filename, size_t filename_size)
|
||||
{
|
||||
char command[2048];
|
||||
snprintf(command, sizeof(command)-1, "gzip -k -d %s", filename);
|
||||
|
||||
printf("Unzip %s\n", filename);
|
||||
|
||||
system(command);
|
||||
|
||||
char new_path[2048];
|
||||
size_t new_path_size = (filename_size - 3);
|
||||
|
||||
snprintf(new_path, sizeof(new_path)-1, "%.*s", (int)new_path_size, filename);
|
||||
|
||||
printf("Process %s:\n", new_path);
|
||||
process(new_path, new_path_size, html_parser);
|
||||
printf("\n");
|
||||
|
||||
unlink(new_path);
|
||||
}
|
||||
|
||||
/* Write the one-line command synopsis to stderr. */
static void usage(void)
{
    fputs("commoncrawl <dir with *.warc.gz>\n", stderr);
}
|
||||
|
||||
int main(int argc, const char * argv[])
|
||||
{
|
||||
if (argc != 2) {
|
||||
usage();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// basic init
|
||||
myhtml_t* myhtml = myhtml_create();
|
||||
myhtml_init(myhtml, MyHTML_OPTIONS_DEFAULT, 1, 0);
|
||||
|
||||
// first tree init
|
||||
global_tree = myhtml_tree_create();
|
||||
myhtml_tree_init(global_tree, myhtml);
|
||||
|
||||
listdir(argv[1], process_unpack);
|
||||
|
||||
// release resources
|
||||
myhtml_tree_destroy(global_tree);
|
||||
myhtml_destroy(myhtml);
|
||||
|
||||
print_total_count();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user