View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  package org.opencastproject.subtitleparser.webvttparser;
22  
23  import org.opencastproject.subtitleparser.SubtitleParsingException;
24  
25  import org.apache.commons.io.input.BOMInputStream;
26  
27  import java.io.BufferedReader;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.io.InputStreamReader;
31  import java.nio.charset.Charset;
32  import java.nio.charset.StandardCharsets;
33  import java.util.regex.Matcher;
34  import java.util.regex.Pattern;
35  
36  /**
37   * Parses WebVTT from a file into a datastructure to allow for easy modification.
38   * Throws exceptions if the read WebVTT is invalid.
39   *
40   * TODO: Comments are currently ignored and discarded. Find a good way to keep comments
41   *  without compromising easy editing.
42   */
43  public class WebVTTParser {
44  
45    private static final String WEBVTT_METADATA_HEADER_STRING = "\\S*[:=]\\S*";
46    private static final Pattern WEBVTT_METADATA_HEADER =
47            Pattern.compile(WEBVTT_METADATA_HEADER_STRING);
48  
49    // Regex checks if not a time interval
50    private static final String WEBVTT_CUE_IDENTIFIER_STRING = "^(?!.*(-->)).*$";
51    private static final Pattern WEBVTT_CUE_IDENTIFIER =
52            Pattern.compile(WEBVTT_CUE_IDENTIFIER_STRING);
53  
54    // Timestamp from time interval
55    private static final String WEBVTT_TIMESTAMP_STRING = "(\\d+:)?[0-5]\\d:[0-5]\\d\\.\\d{3}";
56    private static final Pattern WEBVTT_TIMESTAMP = Pattern.compile(WEBVTT_TIMESTAMP_STRING);
57  
58    private Charset charset; // Charset of the input files
59  
60    public WebVTTParser() {
61      this.charset = StandardCharsets.UTF_8;
62    }
63  
64    public WebVTTParser(Charset charset) {
65      this.charset = charset;
66    }
67  
68    public WebVTTSubtitle parse(InputStream is) throws IOException, SubtitleParsingException {
69      // Wrap input stream into Apache Commons IO BOMInputStream
70      BOMInputStream bomIn = new BOMInputStream(is, false);
71  
72      // Create subtitle object
73      WebVTTSubtitle subtitle = new WebVTTSubtitle();
74  
75      // Read each line
76      BufferedReader webvttReader = new BufferedReader(new InputStreamReader(bomIn, this.charset));
77      String line = "";
78  
79      // File should start with "WEBVTT" on the first line
80      line = webvttReader.readLine();
81      if (line == null) {
82        throw new SubtitleParsingException("WEBVTT Header line is null");
83      }
84  
85      if (!line.startsWith("WEBVTT")) {
86        throw new SubtitleParsingException("Header line did not start with WEBVTT. Got " + line);
87      }
88  
89      subtitle.addHeaderLine(line);
90  
91      // While this is not mentioned in the W3C specs, it seems to be common practice to have additional lines after
92      // the header containing metadata information on the file.
93      while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
94        subtitle.addHeaderLine(line);
95      }
96  
97      // Process the cues
98      while ((line = webvttReader.readLine()) != null) {
99        WebVTTSubtitleCue cue = new WebVTTSubtitleCue();
100 
101       // Skip additional newlines
102       if (line.isEmpty()) {
103         continue;
104       }
105 
106       if (line.startsWith("REGION")) {
107         WebVTTSubtitleRegion region = new WebVTTSubtitleRegion();
108         region.addLine(line);
109         while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
110           region.addLine(line);
111         }
112         subtitle.addRegion(region);
113         continue;
114       }
115 
116       if (line.startsWith("STYLE")) {
117         WebVTTSubtitleStyle style = new WebVTTSubtitleStyle();
118         style.addLine(line);
119         while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
120           style.addLine(line);
121         }
122         subtitle.addStyle(style);
123         continue;
124       }
125 
126       if (line.startsWith("NOTE")) {
127         while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
128           // do nothing
129         }
130         continue;
131       }
132 
133       // Parse the cue identifier (if present)
134       Matcher matcher = WEBVTT_CUE_IDENTIFIER.matcher(line);
135       if (matcher.find()) {
136         cue.setId(line);
137         line = webvttReader.readLine();
138       }
139 
140       // Parse the cue timestamps
141       matcher = WEBVTT_TIMESTAMP.matcher(line);
142 
143       // Parse start timestamp
144       if (!matcher.find()) {
145         throw new SubtitleParsingException("Expected cue start time: " + line);
146       } else {
147         cue.setStartTime(parseTimestamp(matcher.group()));
148       }
149 
150       // Parse end timestamp
151       if (!matcher.find()) {
152         throw new SubtitleParsingException("Expected cue end time: " + line);
153       } else {
154         cue.setEndTime(parseTimestamp(matcher.group()));
155       }
156 
157       // Parse cue settings list
158       String cueSettings = line.substring(matcher.end()).trim();
159       if (!cueSettings.isEmpty()) {
160         cue.setCueSettingsList(cueSettings);
161       }
162 
163 
164       // Parse text
165       while (((line = webvttReader.readLine()) != null) && (!line.isEmpty())) {
166         cue.addLine(line);
167       }
168 
169       subtitle.addCue(cue);
170     }
171 
172     webvttReader.close();
173     is.close();
174 
175     return subtitle;
176   }
177 
178   private static long parseTimestamp(String s) throws NumberFormatException {
179     if (!s.matches(WEBVTT_TIMESTAMP_STRING)) {
180       throw new NumberFormatException("has invalid format");
181     }
182 
183     String[] parts = s.split("\\.", 2);
184     long value = 0;
185     for (String group : parts[0].split(":")) {
186       value = value * 60 + Long.parseLong(group);
187     }
188     return (value * 1000 + Long.parseLong(parts[1]));
189   }
190 }