-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.c
141 lines (116 loc) · 4.8 KB
/
extractor.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <ctype.h>
#include <regex.h>
#include <gumbo.h>
// Minimum "pure" length (as reported by utf8_count) a text node needs to be kept.
#define MIN_CONTENT_LEN 26
// Byte limit applied to the extracted-content buffer.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x % MAX_PAGE_LEN`, `x / MAX_PAGE_LEN`).
#define MAX_PAGE_LEN (64*1024*1024)
// Size limit used when copying and terminating the page title.
#define MAX_TITLE_LEN 1024
// Forward declarations of the file-local helpers defined below.

// Looks for <head>/<title> under `root` and copies its text into `title`;
// returns 1 when a title was copied, 0 otherwise.
static int find_title(const GumboNode* root, char *title, size_t length);
// Walks a NUL-terminated buffer and reports a unit count in *purelen and a
// byte count in *len.
static void utf8_count(const char *buffer, size_t *purelen, size_t *len);
// Recursively appends qualifying text nodes below `node` to `buffer`,
// starting at offset *pos and advancing *pos as it writes.
static void extract_text(GumboNode* node, char *buffer, size_t *pos);
/*
 * Copy the text of the document's <title> element into `title`.
 *
 * root   - root element of the parsed tree; its direct children are searched
 *          for <head>, and the direct children of <head> for <title>.
 * title  - destination buffer, at least `length` bytes large.
 * length - capacity of `title` in bytes, including the terminating NUL.
 *
 * Returns 1 when a title was copied, 0 otherwise (bad arguments, no <head>,
 * no <title>, or a <title> that does not hold exactly one text child).
 * `title` is left untouched whenever 0 is returned. On success the result is
 * always NUL-terminated and holds at most `length - 1` bytes of title text.
 */
static int find_title(const GumboNode* root, char *title, size_t length)
{
    // Validate at runtime instead of via assert(): these conditions depend on
    // the input, and assert() disappears when NDEBUG is defined.
    if (root == NULL || title == NULL || length == 0 ||
        root->type != GUMBO_NODE_ELEMENT) {
        return 0;
    }

    const GumboVector* root_children = &root->v.element.children;
    const GumboNode* head = NULL;
    for (unsigned int i = 0; i < root_children->length; ++i) {
        const GumboNode* child = root_children->data[i];
        if (child->type == GUMBO_NODE_ELEMENT &&
            child->v.element.tag == GUMBO_TAG_HEAD) {
            head = child;
            break;
        }
    }
    if (head == NULL) {
        return 0;
    }

    const GumboVector* head_children = &head->v.element.children;
    for (unsigned int i = 0; i < head_children->length; ++i) {
        const GumboNode* child = head_children->data[i];
        if (child->type != GUMBO_NODE_ELEMENT ||
            child->v.element.tag != GUMBO_TAG_TITLE) {
            continue;
        }

        // Only the first <title> is considered; it must wrap a single text node.
        if (child->v.element.children.length != 1) {
            return 0;
        }
        const GumboNode* title_text = child->v.element.children.data[0];
        if (title_text->type != GUMBO_NODE_TEXT &&
            title_text->type != GUMBO_NODE_WHITESPACE) {
            return 0;
        }

        // snprintf always NUL-terminates within `length` bytes, unlike
        // strncpy, which leaves the buffer unterminated on truncation.
        snprintf(title, length, "%s", title_text->v.text.text);
        return 1;
    }
    return 0;
}
/*
 * Measure a NUL-terminated, UTF-8 encoded string.
 *
 * buffer  - string to scan.
 * purelen - out: number of "units", where a unit is either one run of
 *           consecutive ASCII letters/digits (a word) or one multi-byte
 *           character. Other ASCII bytes (spaces, punctuation, ...) are not
 *           counted.
 * len     - out: number of bytes scanned, i.e. the distance from `buffer` to
 *           its terminating NUL.
 */
static void utf8_count(const char *buffer, size_t *purelen, size_t *len)
{
    // Work on unsigned bytes: <ctype.h> functions and bit tests are only well
    // defined for values representable as unsigned char.
    const unsigned char *cur = (const unsigned char *)buffer;
    size_t units = 0;

    while (*cur != '\0') {
        if ((*cur & 0x80u) == 0) {
            // regard continuous alpha & digit as one word
            if (isalnum(*cur)) {
                // The ASCII guard keeps a locale-dependent isalnum() from
                // swallowing bytes of a following multi-byte character.
                while ((*cur & 0x80u) == 0 && isalnum(*cur)) {
                    cur++;
                }
                units++;
            } else {
                cur++;
            }
        } else {
            // The number of leading 1 bits in the first byte gives the
            // declared length of the sequence.
            unsigned int lead = *cur;
            size_t expected = 0;
            while ((lead & 0x80u) != 0) {
                expected++;
                lead = (lead << 1) & 0xFFu;
            }

            // Consume the lead byte, then only real continuation bytes
            // (10xxxxxx), so a truncated or malformed sequence can never
            // step past the terminating NUL.
            size_t consumed = 1;
            cur++;
            while (consumed < expected && (*cur & 0xC0u) == 0x80u) {
                cur++;
                consumed++;
            }
            units++;
        }
    }

    *purelen = units;
    *len = (size_t)(cur - (const unsigned char *)buffer);
}
/*
 * Recursively collect the text below `node` into `buffer`.
 *
 * node   - subtree to walk. <script> and <style> elements are skipped.
 * buffer - destination; assumed to hold MAX_PAGE_LEN bytes (the limit the
 *          bounds checks below are written against) -- confirm with callers.
 * pos    - in/out: offset of the next free byte in `buffer`.
 *
 * A text node is kept only when its unit count (see utf8_count) is at least
 * MIN_CONTENT_LEN and it does not match the "looks like leftover markup"
 * regex. After every non-first child that produced output, a '\n' is added.
 * On return *pos is always <= MAX_PAGE_LEN - 1, so the caller may store a
 * terminator at buffer[*pos].
 */
static void extract_text(GumboNode* node, char *buffer, size_t *pos)
{
    const size_t capacity = (size_t)(MAX_PAGE_LEN);

    if (node->type == GUMBO_NODE_TEXT) {
        const char *text = node->v.text.text;
        size_t purelen, len;
        utf8_count(text, &purelen, &len);

        //ignore short lines
        if (purelen < MIN_CONTENT_LEN) {
            return;
        }

        // Size the copy from the real string length so the number of bytes
        // written always matches the number of bytes accounted for.
        const size_t nbytes = strlen(text);

        // Need nbytes + 1 bytes for the text and its NUL, and *pos must stay
        // <= capacity - 1 afterwards. (The original compared against *buffer,
        // the first character of the buffer, instead of an offset.)
        if (*pos >= capacity - 1 || nbytes >= capacity - 1 - *pos) {
            return;
        }

        // regcomp() must not live inside assert(): with NDEBUG defined the
        // call vanishes and regexec() would run on an uninitialized regex_t.
        regex_t preg;
        if (regcomp(&preg, "<[^>]*(>[^<>]*<)*/[^>]*>",
                    REG_EXTENDED|REG_ICASE|REG_NOSUB) != 0) {
            return;
        }
        if (regexec(&preg, text, 0, NULL, 0) == REG_NOMATCH) {
            memcpy(buffer + *pos, text, nbytes + 1);
            // NOTE(review): advancing past the NUL keeps the original layout,
            // which leaves an embedded '\0' after each text run. Confirm that
            // consumers expect this before changing it to `+= nbytes`.
            *pos += nbytes + 1;
        }
        regfree(&preg);
    } else if (node->type == GUMBO_NODE_ELEMENT &&
               node->v.element.tag != GUMBO_TAG_SCRIPT &&
               node->v.element.tag != GUMBO_TAG_STYLE) {
        GumboVector* children = &node->v.element.children;
        for (unsigned int i = 0; i < children->length; ++i) {
            const size_t before = *pos;
            extract_text((GumboNode *)children->data[i], buffer, pos);

            // Separate this child's output with a newline, but only when
            // both the '\n' and the following NUL still fit in the buffer.
            if (i != 0 && *pos > before && *pos < capacity - 1) {
                buffer[*pos] = '\n';
                buffer[*pos + 1] = '\0';
                (*pos)++;
            }
        }
    }
}
/*
 * Extract the page title and the main text content from a parsed document.
 *
 * Gout           - parser output; its root node is searched and walked.
 * title          - out: page title, or "" when none is found. Assumed to hold
 *                  at least MAX_TITLE_LEN bytes -- confirm with callers.
 * content_buffer - out: extracted text. Assumed to hold MAX_PAGE_LEN bytes
 *                  (the limit extract_text checks against) -- confirm with
 *                  callers.
 */
void extract(GumboOutput* Gout, char *title, char *content_buffer)
{
    // Start from an empty title so a missing <title> yields "".
    title[0] = '\0';
    find_title(Gout->root, title, MAX_TITLE_LEN - 1);
    // At most MAX_TITLE_LEN - 1 bytes are copied (indices 0..MAX_TITLE_LEN-2),
    // so the terminator belongs at MAX_TITLE_LEN - 1. Index MAX_TITLE_LEN would
    // leave one byte unwritten and lies past a MAX_TITLE_LEN-byte buffer.
    title[MAX_TITLE_LEN - 1] = '\0';

    size_t pos = 0;
    extract_text(Gout->root, content_buffer, &pos);
    content_buffer[pos] = '\0';
}