-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.js
152 lines (124 loc) · 4.53 KB
/
parser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/**
* got sick of seeing loads of over complicated srt modules on npm that kept breaking because they had loads of dependencies
* so decided to make my own. aiming for simple to the point approach.
* give srt. returns json. but can also return plain text of lines.
*/
var fs = require('fs');
// parseSrtFile (below) opens the srt file and reads its contents into a string
// console.log(srtFile)
/**
 * Reads an srt file from disk and parses it into srt json.
 *
 * The file is read synchronously and its content is handed to
 * parseSrtContent; whatever parseSrtContent produces is forwarded to `cb`.
 *
 * @param {string} srtFile - path of the srt file to read
 * @param {function} [cb] - optional callback, called with the srt json
 */
function parseSrtFile(srtFile, cb) {
  var fileBuffer = fs.readFileSync(srtFile);
  var fileContent = fileBuffer.toString();

  parseSrtContent(fileContent, function (parsed) {
    // nothing to notify when no callback was supplied
    if (!cb) {
      return;
    }
    cb(parsed);
  });
}
/**
 * Parses an srt string (the content of an srt file) into srt json.
 *
 * The result is an array of cue objects of the form
 *   { id: "1", startTime: "00:00:06,500", endTime: "00:00:10,790", text: "line one\nline two\n" }
 * where every value is a string and every text line of a cue is terminated
 * by "\n".
 *
 * A cue ends at a blank line, at the end of the input, or when the counter /
 * timecode of the following cue starts, so cues may carry any number of text
 * lines (one, two, three, ...). Only cues that have text are returned.
 *
 * Handles "\n", "\r\n" and "\r" line endings and a leading byte order mark.
 * Any non-blank line that is not a counter or a timecode is kept as text,
 * including lines made only of non-ASCII characters or punctuation.
 *
 * @param {string} srt - content of an srt file
 * @param {function} [cb] - optional callback, called synchronously with the array of cues
 */
function parseSrtContent(srt, cb) {
  // strip a leading BOM (it would hide the first counter) and split on any line ending
  var lines = srt.replace(/^\uFEFF/, "").split(/\r\n|\r|\n/);

  // counter line, e.g. "12"
  var counterPattern = /^[0-9]+$/;
  // "00:00:06,500 --> 00:00:10,790"; some files use 2 digits for the milliseconds
  var timecodePattern = /\d{2}:\d{2}:\d{2},\d{2,3} --> \d{2}:\d{2}:\d{2},\d{2,3}/;

  var result = [];
  // cue being assembled; lives outside the loop because its parts span several lines
  var cue = {};

  // saves the cue assembled so far (only if it has text) and starts a fresh one
  function flushCue() {
    if (cue.text) {
      result.push(cue);
    }
    cue = {};
  }

  // true when the line at `index` exists and holds a timecode pair
  function isTimecodeAt(index) {
    return index < lines.length && timecodePattern.test(lines[index]);
  }

  for (var i = 0; i < lines.length; i++) {
    var rawLine = lines[i];
    var line = rawLine.trim();

    if (line === "") {
      // a blank line separates cues
      flushCue();
    } else if (timecodePattern.test(line)) {
      // a second timecode without a separator means a new cue has started
      if (cue.startTime !== undefined) {
        flushCue();
      }
      var timecodes = line.split(" --> ");
      cue.startTime = timecodes[0];
      cue.endTime = timecodes[1];
    } else if (counterPattern.test(line) && (cue.startTime === undefined || isTimecodeAt(i + 1))) {
      // a digits-only line is a counter only at the start of a cue, or when a
      // timecode follows it; otherwise it is subtitle text such as "1984"
      if (cue.startTime !== undefined || cue.text) {
        flushCue();
      }
      cue.id = line;
    } else {
      // "\n" keeps consecutive lines of the same cue on separate lines
      cue.text = (cue.text || "") + rawLine + "\n";
    }
  }
  // the last cue is not necessarily followed by a blank line
  flushCue();

  if (cb) {
    cb(result);
  }
}
/**
 * Reads an srt file and hands back only the spoken text, without
 * counters or timecodes.
 *
 * The file is parsed with parseSrtFile and the `text` attribute of every
 * element of the resulting array is concatenated in order.
 *
 * @param {string} srtF - path of the srt file
 * @param {function} cb - called with the concatenated text
 */
function parseSrtFileToText(srtF, cb) {
  parseSrtFile(srtF, function (cues) {
    // fold the text attribute of each srt json element into one string
    var plainText = cues.reduce(function (soFar, cue) {
      return soFar + cue.text;
    }, "");
    cb(plainText);
  });
}
/**
 * Turns an srt string (the content of an srt file) into only the spoken
 * text, without counters or timecodes.
 *
 * The string is parsed with parseSrtContent and the `text` attribute of
 * every element of the resulting array is concatenated in order.
 *
 * @param {string} srtF - content of an srt file
 * @param {function} cb - called with the concatenated text
 */
function parseSrtContentToText(srtF, cb) {
  parseSrtContent(srtF, function (cues) {
    // fold the text attribute of each srt json element into one string
    var plainText = cues.reduce(function (soFar, cue) {
      return soFar + cue.text;
    }, "");
    cb(plainText);
  });
}
// Public API, part 1: plain srt parsing.
// srt file path -> srt json (array of line objects), delivered through a callback
module.exports.parseSrtFileToJson = parseSrtFile;
// srt file path -> plain text of the lines (no timecodes), delivered through a callback
module.exports.parseSrtFileToText = parseSrtFileToText;
// srt string (content of an srt file) -> srt json, delivered through a callback
module.exports.parseSrtContentToJson = parseSrtContent;
// srt string (content of an srt file) -> plain text of the lines, delivered through a callback
module.exports.parseSrtContentToText = parseSrtContentToText
///////////////////Word accurate lines ///////////
var srtJsonToWordLineJson = require('./srtJsonToWordLinesJson.js').convertTowordsLines;
/**
 * Parses an srt file and converts the resulting srt json with
 * srtJsonToWordLineJson.
 *
 * The converted result is passed to `cb` when one is given; otherwise it is
 * returned to the caller.
 *
 * @param {string} srtFile - path of the srt file
 * @param {function} [cb] - optional callback, called with the converted result
 * @returns {*} the converted result when no callback is given, otherwise undefined
 */
function parseSrtFileToJsonWordsLines(srtFile, cb) {
  var wordsLines;
  // parseSrtFile calls its callback synchronously, so the value is captured
  // here and handed out from the outer function; a `return` inside the
  // callback would never reach the caller.
  parseSrtFile(srtFile, function (srtJson) {
    wordsLines = srtJsonToWordLineJson(srtJson);
  });

  if (cb) {
    cb(wordsLines);
    return undefined;
  }
  return wordsLines;
}
/**
 * Parses an srt string (the content of an srt file) and converts the
 * resulting srt json with srtJsonToWordLineJson.
 *
 * The converted result is passed to `cb` when one is given; otherwise it is
 * returned to the caller.
 *
 * @param {string} srt - content of an srt file
 * @param {function} [cb] - optional callback, called with the converted result
 * @returns {*} the converted result when no callback is given, otherwise undefined
 */
function parseSrtContentToJsonWordsLines(srt, cb) {
  var wordsLines;
  // parseSrtContent calls its callback synchronously, so the value is
  // captured here and handed out from the outer function; a `return` inside
  // the callback would never reach the caller.
  parseSrtContent(srt, function (srtJson) {
    wordsLines = srtJsonToWordLineJson(srtJson);
  });

  if (cb) {
    cb(wordsLines);
    return undefined;
  }
  return wordsLines;
}
// Public API, part 2: srt file path -> srt json passed through srtJsonToWordLineJson
module.exports.parseSrtFileToJsonWordsLines = parseSrtFileToJsonWordsLines;
// srt string (content of an srt file) -> srt json passed through srtJsonToWordLineJson
module.exports.parseSrtContentToJsonWordsLines = parseSrtContentToJsonWordsLines