I'm trying to parse srt file with javascript. I found some code from stackoverflow but there is an issue. I'm parsing the srt file line by line to recognize the line of subtitle, time and the subtitle text. But when the code read the subtitle text, my code just able to read one line of the subtitle each part whereas some part of the subtitles include 2 or some line.

this is my code

var PF_SRT = function() {
                  //SRT format
                  var pattern = /(\d+)\n([\d:,]+)\s+-{2}\>\s+([\d:,]+)\n([\s\S]*?(?=\n{2}|$))/gm;
                  var _regExp;

                  var init = function() {
                    _regExp = new RegExp(pattern);
                  };
                  var parse = function(f) {
                    if (typeof(f) != "string")
                      throw "Sorry, Parser accept string only.";

                    var result = [];
                    if (f == null)
                      return _subtitles;


                    f = f.replace(/\r\n|\r|\n/g, '\n')


                    while ((matches = pattern.exec(f)) != null) {
                      result.push(toLineObj(matches));
                    }

                    return result;
                  }
                  var toLineObj = function(group) {
                    var hms_start = group[2].replace(',', ':').split(':');   

                    var hms_end = group[3].replace(',', ':').split(':');   

                    return {
                      line: group[1],
                      startTime: (+hms_start[0]) * 60 * 60 + (+hms_start[1]) * 60 + (+hms_start[2]) +'.'+ hms_start[3],
                      endTime: (+hms_end[0]) * 60 * 60 + (+hms_end[1]) * 60 + (+hms_end[2]) +'.'+ hms_end[3],
                      text: group[4]
                    };
                  }
                  init();
                  return {
                    parse: parse
                  }
                }();

// execution
// result is the entire line of srt subtitle file
PF_SRT.parse(result);

I expect the output of

6
00:00:32,616 --> 00:00:41,496
{\a2}{\c&HFFFFFF&}{\fnTahoma} And 23 of them say forget it
you say this thing never worked 
because there's no such thing called internet in the world

to

6
00:00:32,616 --> 00:00:41,496
{\a2}{\c&HFFFFFF&}{\fnTahoma} And 23 of them say forget it<br>you say this thing never worked<br>because there's no such thing called internet in the world

1 Answers

0
mguida On

With this line, you find common new line characters and replace them with a \n new line.

f = f.replace(/\r\n|\r|\n/g, '\n')

You'll need to modify it to also replace HTML line breaks <br>, with a new line character.

For example:

f = f.replace(/\r\n|\r|\n|<br>/g, '\n')