1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.opencastproject.subtitleparser.webvttparser;
22
23 import org.opencastproject.subtitleparser.SubtitleParsingException;
24
25 import java.io.BufferedReader;
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.io.InputStreamReader;
29 import java.nio.charset.Charset;
30 import java.nio.charset.StandardCharsets;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33
34
35
36
37
38
39
40
41 public class WebVTTParser {
42
43 private static final String WEBVTT_METADATA_HEADER_STRING = "\\S*[:=]\\S*";
44 private static final Pattern WEBVTT_METADATA_HEADER =
45 Pattern.compile(WEBVTT_METADATA_HEADER_STRING);
46
47
48 private static final String WEBVTT_CUE_IDENTIFIER_STRING = "^(?!.*(-->)).*$";
49 private static final Pattern WEBVTT_CUE_IDENTIFIER =
50 Pattern.compile(WEBVTT_CUE_IDENTIFIER_STRING);
51
52
53 private static final String WEBVTT_TIMESTAMP_STRING = "(\\d+:)?[0-5]\\d:[0-5]\\d\\.\\d{3}";
54 private static final Pattern WEBVTT_TIMESTAMP = Pattern.compile(WEBVTT_TIMESTAMP_STRING);
55
56 private Charset charset;
57
58 public WebVTTParser() {
59 this.charset = StandardCharsets.UTF_8;
60 }
61
62 public WebVTTParser(Charset charset) {
63 this.charset = charset;
64 }
65
66 public WebVTTSubtitle parse(InputStream is) throws IOException, SubtitleParsingException {
67
68 WebVTTSubtitle subtitle = new WebVTTSubtitle();
69
70
71 BufferedReader webvttReader = new BufferedReader(new InputStreamReader(is, this.charset));
72 String line = "";
73
74
75 line = webvttReader.readLine();
76 if (line == null) {
77 throw new SubtitleParsingException("WEBVTT Header line is null");
78 }
79
80 if (!line.startsWith("WEBVTT")) {
81 throw new SubtitleParsingException("Header line did not start with WEBVTT. Got " + line);
82 }
83
84 subtitle.addHeaderLine(line);
85
86
87
88 while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
89 subtitle.addHeaderLine(line);
90 }
91
92
93 while ((line = webvttReader.readLine()) != null) {
94 WebVTTSubtitleCue cue = new WebVTTSubtitleCue();
95
96
97 if (line.isEmpty()) {
98 continue;
99 }
100
101 if (line.startsWith("REGION")) {
102 WebVTTSubtitleRegion region = new WebVTTSubtitleRegion();
103 region.addLine(line);
104 while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
105 region.addLine(line);
106 }
107 subtitle.addRegion(region);
108 continue;
109 }
110
111 if (line.startsWith("STYLE")) {
112 WebVTTSubtitleStyle style = new WebVTTSubtitleStyle();
113 style.addLine(line);
114 while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
115 style.addLine(line);
116 }
117 subtitle.addStyle(style);
118 continue;
119 }
120
121 if (line.startsWith("NOTE")) {
122 while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
123
124 }
125 continue;
126 }
127
128
129 Matcher matcher = WEBVTT_CUE_IDENTIFIER.matcher(line);
130 if (matcher.find()) {
131 cue.setId(line);
132 line = webvttReader.readLine();
133 }
134
135
136 matcher = WEBVTT_TIMESTAMP.matcher(line);
137
138
139 if (!matcher.find()) {
140 throw new SubtitleParsingException("Expected cue start time: " + line);
141 } else {
142 cue.setStartTime(parseTimestamp(matcher.group()));
143 }
144
145
146 if (!matcher.find()) {
147 throw new SubtitleParsingException("Expected cue end time: " + line);
148 } else {
149 cue.setEndTime(parseTimestamp(matcher.group()));
150 }
151
152
153 String cueSettings = line.substring(matcher.end()).trim();
154 if (!cueSettings.isEmpty()) {
155 cue.setCueSettingsList(cueSettings);
156 }
157
158
159
160 while (((line = webvttReader.readLine()) != null) && (!line.isEmpty())) {
161 cue.addLine(line);
162 }
163
164 subtitle.addCue(cue);
165 }
166
167 webvttReader.close();
168 is.close();
169
170 return subtitle;
171 }
172
173 private static long parseTimestamp(String s) throws NumberFormatException {
174 if (!s.matches(WEBVTT_TIMESTAMP_STRING)) {
175 throw new NumberFormatException("has invalid format");
176 }
177
178 String[] parts = s.split("\\.", 2);
179 long value = 0;
180 for (String group : parts[0].split(":")) {
181 value = value * 60 + Long.parseLong(group);
182 }
183 return (value * 1000 + Long.parseLong(parts[1]));
184 }
185 }