1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.opencastproject.subtitleparser.webvttparser;
22
23 import org.opencastproject.subtitleparser.SubtitleParsingException;
24
25 import org.apache.commons.io.input.BOMInputStream;
26
27 import java.io.BufferedReader;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.InputStreamReader;
31 import java.nio.charset.Charset;
32 import java.nio.charset.StandardCharsets;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
35
36
37
38
39
40
41
42
43 public class WebVTTParser {
44
45 private static final String WEBVTT_METADATA_HEADER_STRING = "\\S*[:=]\\S*";
46 private static final Pattern WEBVTT_METADATA_HEADER =
47 Pattern.compile(WEBVTT_METADATA_HEADER_STRING);
48
49
50 private static final String WEBVTT_CUE_IDENTIFIER_STRING = "^(?!.*(-->)).*$";
51 private static final Pattern WEBVTT_CUE_IDENTIFIER =
52 Pattern.compile(WEBVTT_CUE_IDENTIFIER_STRING);
53
54
55 private static final String WEBVTT_TIMESTAMP_STRING = "(\\d+:)?[0-5]\\d:[0-5]\\d\\.\\d{3}";
56 private static final Pattern WEBVTT_TIMESTAMP = Pattern.compile(WEBVTT_TIMESTAMP_STRING);
57
58 private Charset charset;
59
60 public WebVTTParser() {
61 this.charset = StandardCharsets.UTF_8;
62 }
63
64 public WebVTTParser(Charset charset) {
65 this.charset = charset;
66 }
67
68 public WebVTTSubtitle parse(InputStream is) throws IOException, SubtitleParsingException {
69
70 BOMInputStream bomIn = new BOMInputStream(is, false);
71
72
73 WebVTTSubtitle subtitle = new WebVTTSubtitle();
74
75
76 BufferedReader webvttReader = new BufferedReader(new InputStreamReader(bomIn, this.charset));
77 String line = "";
78
79
80 line = webvttReader.readLine();
81 if (line == null) {
82 throw new SubtitleParsingException("WEBVTT Header line is null");
83 }
84
85 if (!line.startsWith("WEBVTT")) {
86 throw new SubtitleParsingException("Header line did not start with WEBVTT. Got " + line);
87 }
88
89 subtitle.addHeaderLine(line);
90
91
92
93 while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
94 subtitle.addHeaderLine(line);
95 }
96
97
98 while ((line = webvttReader.readLine()) != null) {
99 WebVTTSubtitleCue cue = new WebVTTSubtitleCue();
100
101
102 if (line.isEmpty()) {
103 continue;
104 }
105
106 if (line.startsWith("REGION")) {
107 WebVTTSubtitleRegion region = new WebVTTSubtitleRegion();
108 region.addLine(line);
109 while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
110 region.addLine(line);
111 }
112 subtitle.addRegion(region);
113 continue;
114 }
115
116 if (line.startsWith("STYLE")) {
117 WebVTTSubtitleStyle style = new WebVTTSubtitleStyle();
118 style.addLine(line);
119 while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
120 style.addLine(line);
121 }
122 subtitle.addStyle(style);
123 continue;
124 }
125
126 if (line.startsWith("NOTE")) {
127 while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
128
129 }
130 continue;
131 }
132
133
134 Matcher matcher = WEBVTT_CUE_IDENTIFIER.matcher(line);
135 if (matcher.find()) {
136 cue.setId(line);
137 line = webvttReader.readLine();
138 }
139
140
141 matcher = WEBVTT_TIMESTAMP.matcher(line);
142
143
144 if (!matcher.find()) {
145 throw new SubtitleParsingException("Expected cue start time: " + line);
146 } else {
147 cue.setStartTime(parseTimestamp(matcher.group()));
148 }
149
150
151 if (!matcher.find()) {
152 throw new SubtitleParsingException("Expected cue end time: " + line);
153 } else {
154 cue.setEndTime(parseTimestamp(matcher.group()));
155 }
156
157
158 String cueSettings = line.substring(matcher.end()).trim();
159 if (!cueSettings.isEmpty()) {
160 cue.setCueSettingsList(cueSettings);
161 }
162
163
164
165 while (((line = webvttReader.readLine()) != null) && (!line.isEmpty())) {
166 cue.addLine(line);
167 }
168
169 subtitle.addCue(cue);
170 }
171
172 webvttReader.close();
173 is.close();
174
175 return subtitle;
176 }
177
178 private static long parseTimestamp(String s) throws NumberFormatException {
179 if (!s.matches(WEBVTT_TIMESTAMP_STRING)) {
180 throw new NumberFormatException("has invalid format");
181 }
182
183 String[] parts = s.split("\\.", 2);
184 long value = 0;
185 for (String group : parts[0].split(":")) {
186 value = value * 60 + Long.parseLong(group);
187 }
188 return (value * 1000 + Long.parseLong(parts[1]));
189 }
190 }