-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsehtml.js
117 lines (111 loc) · 3.38 KB
/
parsehtml.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// Replacement rules applied as the last step of parseBody; each entry is
// read as [0] (the pattern) and [1] (the replacement).
var cleanups=require("./cleanups");
// NOTE(review): not referenced anywhere in the visible part of this file —
// confirm whether it is still needed before removing it.
var pb_bjt="";
/**
 * Decode decimal numeric character references (`&#NNN;`) in a string.
 *
 * @param {string} content - Text that may contain `&#NNN;` references.
 * @returns {string} The text with each reference replaced by the character
 *   it names. References above U+10FFFF are not valid code points and are
 *   left exactly as written.
 */
function replaceEntity(content) {
  const decoded = content.replace(/&#(\d+);/g, function (entity, digits) {
    const codePoint = Number.parseInt(digits, 10);
    // String.fromCodePoint throws a RangeError above U+10FFFF, so keep the
    // original text for out-of-range references instead of crashing.
    if (codePoint > 0x10ffff) {
      return entity;
    }
    // fromCodePoint (unlike fromCharCode) handles code points above U+FFFF
    // by emitting the proper surrogate pair.
    return String.fromCodePoint(codePoint);
  });
  // NOTE(review): as shown here each pattern and its replacement are the same
  // character, so these two calls look like no-ops. Presumably the patterns
  // were originally a different spelling of ñ/Ñ (a named entity or a
  // decomposed form) — confirm against the repository copy.
  return decoded.replace(/ñ/g,"ñ").replace(/Ñ/g,"Ñ");
}
/**
 * Extract the paragraph section of an HTML document and decode its numeric
 * character references.
 *
 * The first literal `</P></BODY>` is removed, then the text from the first
 * `<P>` up to (not including) the last remaining `</P>` is taken and passed
 * through replaceEntity.
 *
 * @param {string} content - Raw HTML text.
 * @param {*} prefix - Unused by this function.
 * @param {*} fn - Unused by this function.
 * @returns {string} The extracted, entity-decoded section.
 */
function getBody(content, prefix, fn) {
  const withoutTail = content.replace("</P></BODY>", "");
  const firstOpen = withoutTail.indexOf("<P>");
  const lastClose = withoutTail.lastIndexOf("</P>");
  // substring (not slice) on purpose: a -1 index is treated as 0 and
  // reversed bounds are swapped.
  const section = withoutTail.substring(firstOpen, lastClose);
  return replaceEntity(section);
}
/**
 * Rewrite numbered `<P>N. text</FONT> </P>` blocks as either numbered
 * paragraphs (`<p n="N">`) or note definitions (`<ndef n="N">`).
 *
 * A block numbered exactly "1" that is not the first numbered block starts a
 * note group. While a note group is open, a block whose number continues the
 * note sequence becomes an `<ndef>`; a block whose number continues the
 * paragraph sequence closes the group and becomes a `<p>`. Any other number
 * is logged and emitted as `<p n="N" s="?">`.
 *
 * Inside a note, the first `[P T S.]` and the first `[PTS]` marker (with
 * surrounding comma/spaces as matched by the patterns) are removed and the
 * note is tagged `source="pts"`.
 *
 * @param {string} content - Text containing numbered paragraph blocks.
 * @param {string} fn - File name, included in the out-of-sequence log line.
 * @returns {string} The text with numbered blocks rewritten.
 */
function parsePNum_Note(content, fn) {
  let seenBlocks = 0;
  // Left undefined until a block numbered "1" is seen.
  let inNoteGroup;
  let lastParagraph = 0;
  let lastNote = 0;

  return content.replace(/<P>(\d+)\. ?([\S\s]*?)<\/FONT> <\/P>/g, function (whole, label, text) {
    const number = Number.parseInt(label, 10);

    if (label === "1") {
      // A "1" after earlier numbered blocks restarts numbering as notes.
      inNoteGroup = seenBlocks > 0;
      if (inNoteGroup) {
        lastNote = 0;
      }
    }
    seenBlocks++;

    if (inNoteGroup && number === lastNote + 1) {
      lastNote = number;
      let sourceAttr = "";
      const markPts = function () {
        sourceAttr = ' source="pts"';
        return "";
      };
      const noteText = text
        .replace(/,? ?\[P T S\.\]/, markPts)
        .replace(/,? ? ?\[PTS\] ?/, markPts);
      return '<ndef n="' + label + '"' + sourceAttr + '>' + noteText + "</ndef>";
    }

    if (number === lastParagraph + 1) {
      // The paragraph sequence resumed, so the note group is over.
      if (inNoteGroup) {
        inNoteGroup = false;
      }
      lastParagraph = number;
      return '<p n="' + label + '">' + text + "</p>";
    }

    // Out of sequence for both counters: resynchronise on this number and
    // flag the paragraph so it can be reviewed.
    lastParagraph = number;
    console.log("error pnum", fn, lastParagraph, lastNote, label, inNoteGroup);
    return '<p n="' + label + '" s="?">' + text + "</p>";
  });
}
/**
 * Replace every `[PTS Page N] [\q M/]` marker with `<pb s="pts" n="M"/>`.
 *
 * When N and M differ numerically, a line is logged with the file name and
 * both values; the emitted tag always uses M.
 *
 * @param {string} content - Text containing PTS page markers.
 * @param {string} fn - File name used in the mismatch log line.
 * @returns {string} The text with markers replaced by page-break tags.
 */
function parsePTSPage(content, fn) {
  const marker = /\[PTS Page (\d+)\] \[\\q (\d+)\/\]/g;
  const toPageBreak = (whole, labelled, tagged) => {
    const agree = Number.parseInt(labelled, 10) === Number.parseInt(tagged, 10);
    if (!agree) {
      console.log("error pts page", fn, labelled, tagged);
    }
    return `<pb s="pts" n="${tagged}"/>`;
  };
  return content.replace(marker, toPageBreak);
}
/**
 * Replace PTS and BJT volume header lines with `<VOL type="…" id="…"/>` tags.
 *
 * The id is the character captured after "Vol" followed by the digit
 * captured from the FONT SIZE attribute.
 *
 * @param {string} content - Text containing volume header lines.
 * @param {*} fn - Unused by this function.
 * @returns {string} The text with volume lines replaced.
 */
function parseVol(content, fn) {
  const ptsLine = /<FONT FACE="Times Ext Roman" SIZE=(\d)>\[PTS Vol (.) - \d\] \[\\z (.) \/\] \[\\f (.) \/]<\/FONT> <BR>\n/g;
  const bjtLine = /<FONT FACE="Times Ext Roman" SIZE=(\d)>\[BJT Vol (.) - \d\] \[\\z (.) \/\] \[\\w (.) \/]<\/FONT> <BR>\n/g;
  // Builds the replacement callback for one edition type.
  const volTag = (type) => (whole, size, volume) => `<VOL type="${type}" id="${volume}${size}"/>`;
  const afterPts = content.replace(ptsLine, volTag("pts"));
  return afterPts.replace(bjtLine, volTag("bjt"));
}
/**
 * Turn `<FONT FACE="Times Ext Roman" SIZE=n>…</FONT> <BR>` lines into
 * `<Hn>…</Hn>` headings, where n is the SIZE digit.
 *
 * @param {string} content - Text containing font-sized header lines.
 * @param {*} fn - Unused by this function.
 * @returns {string} The text with header lines rewritten.
 */
function parseHeader(content, fn) {
  const headerLine = /<FONT FACE="Times Ext Roman" SIZE=(\d)>(.+?)<\/FONT> <BR>/g;
  return content.replace(headerLine, (whole, level, title) => {
    const tag = `H${level}`;
    return `<${tag}>${title}</${tag}>`;
  });
}
/**
 * Wrap the text between a SIZE=5 `<FONT FACE="Times Ext Roman">` opening tag
 * and the next `</P>` on the same line in `<sutta>…</sutta>`.
 *
 * @param {string} content - Text containing SIZE=5 font runs.
 * @param {*} fn - Unused by this function.
 * @returns {string} The text with those runs replaced by `<sutta>` elements.
 */
function parseSuttaname(content, fn) {
  const suttaRun = /<FONT FACE="Times Ext Roman" SIZE=5>(.+?)<\/P>/g;
  const wrap = (whole, name) => `<sutta>${name}</sutta>`;
  return content.replace(suttaRun, wrap);
}
/**
 * Replace `[BJT Page N] [\x M/]` markers with `<pb s="bjt" n="M"/>`.
 *
 * Markers wrapped as a whole paragraph (`<P>…</FONT> </P>`) are replaced
 * first so the wrapper is removed with them; any remaining bare markers are
 * replaced afterwards. When N and M differ numerically, a line is logged
 * with the file name and both values; the emitted tag always uses M.
 *
 * @param {string} content - Text containing BJT page markers.
 * @param {string} fn - File name used in the mismatch log line.
 * @returns {string} The text with markers replaced by page-break tags.
 */
function parseBJTPage(content, fn) {
  const wrappedMarker = /<P>\[BJT Page (\d+)\] \[\\x (\d+)\/\]<\/FONT> <\/P>/g;
  const bareMarker = /\[BJT Page (\d+)\] \[\\x (\d+)\/\]/g;
  // Shared by both passes: the capture groups have the same meaning.
  const toPageBreak = (whole, labelled, tagged) => {
    if (Number.parseInt(labelled, 10) !== Number.parseInt(tagged, 10)) {
      console.log("error bjt page", fn, labelled, tagged);
    }
    return `<pb s="bjt" n="${tagged}"/>`;
  };
  const withoutWrapped = content.replace(wrappedMarker, toPageBreak);
  return withoutWrapped.replace(bareMarker, toPageBreak);
}
/**
 * Rewrite each `<P>…</FONT> </P>` block (non-empty, may span lines) as
 * `<p>…</p>`.
 *
 * @param {string} content - Text containing `<P>` blocks.
 * @param {*} fn - Unused by this function.
 * @returns {string} The text with those blocks rewritten.
 */
function parseNormalP(content, fn) {
  const paragraph = /<P>([\S\s]+?)<\/FONT> <\/P>/g;
  const toLowerP = (whole, inner) => `<p>${inner}</p>`;
  return content.replace(paragraph, toLowerP);
}
/**
 * Run every markup pass over an extracted body and apply the cleanup rules.
 *
 * Passes run in a fixed order: BJT page markers, PTS page markers, numbered
 * paragraphs/notes, volume lines, sutta names, headers, then the remaining
 * plain paragraphs. parseNormalP goes last because its pattern matches any
 * `<P>…</FONT> </P>` block still present.
 *
 * The file name is forwarded to every pass so that the diagnostics logged by
 * the page-marker passes identify the file instead of printing `undefined`.
 *
 * @param {string} body - Body text, e.g. as returned by getBody.
 * @param {string} fn - File name, used only in diagnostic log lines.
 * @returns {{text: string, links: Array}} The converted text and an (always
 *   empty) links array.
 */
function parseBody(body, fn) {
  const passes = [
    parseBJTPage,
    parsePTSPage,
    parsePNum_Note,
    parseVol,
    parseSuttaname,
    parseHeader,
    parseNormalP,
  ];
  let text = body;
  for (const pass of passes) {
    text = pass(text, fn);
  }
  // Each cleanup rule is read as [pattern, replacement].
  for (let i = 0; i < cleanups.length; i++) {
    const rule = cleanups[i];
    text = text.replace(rule[0], rule[1]);
  }
  return { text: text, links: [] };
}
// Public API: only getBody and parseBody are exported; the individual parse
// passes stay private to this module.
module.exports={getBody,parseBody};