View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  package org.opencastproject.subtitleparser.webvttparser;
22  
23  import org.opencastproject.subtitleparser.SubtitleParsingException;
24  
25  import java.io.BufferedReader;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.io.InputStreamReader;
29  import java.nio.charset.Charset;
30  import java.nio.charset.StandardCharsets;
31  import java.util.regex.Matcher;
32  import java.util.regex.Pattern;
33  
34  /**
35   * Parses WebVTT from a file into a datastructure to allow for easy modification.
36   * Throws exceptions if the read WebVTT is invalid.
37   *
38   * TODO: Comments are currently ignored and discarded. Find a good way to keep comments
39   *  without compromising easy editing.
40   */
41  public class WebVTTParser {
42  
43    private static final String WEBVTT_METADATA_HEADER_STRING = "\\S*[:=]\\S*";
44    private static final Pattern WEBVTT_METADATA_HEADER =
45            Pattern.compile(WEBVTT_METADATA_HEADER_STRING);
46  
47    // Regex checks if not a time interval
48    private static final String WEBVTT_CUE_IDENTIFIER_STRING = "^(?!.*(-->)).*$";
49    private static final Pattern WEBVTT_CUE_IDENTIFIER =
50            Pattern.compile(WEBVTT_CUE_IDENTIFIER_STRING);
51  
52    // Timestamp from time interval
53    private static final String WEBVTT_TIMESTAMP_STRING = "(\\d+:)?[0-5]\\d:[0-5]\\d\\.\\d{3}";
54    private static final Pattern WEBVTT_TIMESTAMP = Pattern.compile(WEBVTT_TIMESTAMP_STRING);
55  
56    private Charset charset; // Charset of the input files
57  
58    public WebVTTParser() {
59      this.charset = StandardCharsets.UTF_8;
60    }
61  
62    public WebVTTParser(Charset charset) {
63      this.charset = charset;
64    }
65  
66    public WebVTTSubtitle parse(InputStream is) throws IOException, SubtitleParsingException {
67      // Create subtitle object
68      WebVTTSubtitle subtitle = new WebVTTSubtitle();
69  
70      // Read each line
71      BufferedReader webvttReader = new BufferedReader(new InputStreamReader(is, this.charset));
72      String line = "";
73  
74      // File should start with "WEBVTT" on the first line
75      line = webvttReader.readLine();
76      if (line == null) {
77        throw new SubtitleParsingException("WEBVTT Header line is null");
78      }
79  
80      if (!line.startsWith("WEBVTT")) {
81        throw new SubtitleParsingException("Header line did not start with WEBVTT. Got " + line);
82      }
83  
84      subtitle.addHeaderLine(line);
85  
86      // While this is not mentioned in the W3C specs, it seems to be common practice to have additional lines after
87      // the header containing metadata information on the file.
88      while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
89        subtitle.addHeaderLine(line);
90      }
91  
92      // Process the cues
93      while ((line = webvttReader.readLine()) != null) {
94        WebVTTSubtitleCue cue = new WebVTTSubtitleCue();
95  
96        // Skip additional newlines
97        if (line.isEmpty()) {
98          continue;
99        }
100 
101       if (line.startsWith("REGION")) {
102         WebVTTSubtitleRegion region = new WebVTTSubtitleRegion();
103         region.addLine(line);
104         while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
105           region.addLine(line);
106         }
107         subtitle.addRegion(region);
108         continue;
109       }
110 
111       if (line.startsWith("STYLE")) {
112         WebVTTSubtitleStyle style = new WebVTTSubtitleStyle();
113         style.addLine(line);
114         while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
115           style.addLine(line);
116         }
117         subtitle.addStyle(style);
118         continue;
119       }
120 
121       if (line.startsWith("NOTE")) {
122         while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
123           // do nothing
124         }
125         continue;
126       }
127 
128       // Parse the cue identifier (if present)
129       Matcher matcher = WEBVTT_CUE_IDENTIFIER.matcher(line);
130       if (matcher.find()) {
131         cue.setId(line);
132         line = webvttReader.readLine();
133       }
134 
135       // Parse the cue timestamps
136       matcher = WEBVTT_TIMESTAMP.matcher(line);
137 
138       // Parse start timestamp
139       if (!matcher.find()) {
140         throw new SubtitleParsingException("Expected cue start time: " + line);
141       } else {
142         cue.setStartTime(parseTimestamp(matcher.group()));
143       }
144 
145       // Parse end timestamp
146       if (!matcher.find()) {
147         throw new SubtitleParsingException("Expected cue end time: " + line);
148       } else {
149         cue.setEndTime(parseTimestamp(matcher.group()));
150       }
151 
152       // Parse cue settings list
153       String cueSettings = line.substring(matcher.end()).trim();
154       if (!cueSettings.isEmpty()) {
155         cue.setCueSettingsList(cueSettings);
156       }
157 
158 
159       // Parse text
160       while (((line = webvttReader.readLine()) != null) && (!line.isEmpty())) {
161         cue.addLine(line);
162       }
163 
164       subtitle.addCue(cue);
165     }
166 
167     webvttReader.close();
168     is.close();
169 
170     return subtitle;
171   }
172 
173   private static long parseTimestamp(String s) throws NumberFormatException {
174     if (!s.matches(WEBVTT_TIMESTAMP_STRING)) {
175       throw new NumberFormatException("has invalid format");
176     }
177 
178     String[] parts = s.split("\\.", 2);
179     long value = 0;
180     for (String group : parts[0].split(":")) {
181       value = value * 60 + Long.parseLong(group);
182     }
183     return (value * 1000 + Long.parseLong(parts[1]));
184   }
185 }