WebVTTParser.java

/*
 * Licensed to The Apereo Foundation under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 *
 * The Apereo Foundation licenses this file to you under the Educational
 * Community License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License
 * at:
 *
 *   http://opensource.org/licenses/ecl2.txt
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 */
package org.opencastproject.subtitleparser.webvttparser;

import org.opencastproject.subtitleparser.SubtitleParsingException;

import org.apache.commons.io.input.BOMInputStream;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parses WebVTT from a file into a datastructure to allow for easy modification.
 * Throws exceptions if the read WebVTT is invalid.
 *
 * TODO: Comments are currently ignored and discarded. Find a good way to keep comments
 *  without compromising easy editing.
 */
public class WebVTTParser {

  private static final String WEBVTT_METADATA_HEADER_STRING = "\\S*[:=]\\S*";
  private static final Pattern WEBVTT_METADATA_HEADER =
          Pattern.compile(WEBVTT_METADATA_HEADER_STRING);

  // Regex checks if not a time interval
  private static final String WEBVTT_CUE_IDENTIFIER_STRING = "^(?!.*(-->)).*$";
  private static final Pattern WEBVTT_CUE_IDENTIFIER =
          Pattern.compile(WEBVTT_CUE_IDENTIFIER_STRING);

  // Timestamp from time interval
  private static final String WEBVTT_TIMESTAMP_STRING = "(\\d+:)?[0-5]\\d:[0-5]\\d\\.\\d{3}";
  private static final Pattern WEBVTT_TIMESTAMP = Pattern.compile(WEBVTT_TIMESTAMP_STRING);

  private Charset charset; // Charset of the input files

  public WebVTTParser() {
    this.charset = StandardCharsets.UTF_8;
  }

  public WebVTTParser(Charset charset) {
    this.charset = charset;
  }

  public WebVTTSubtitle parse(InputStream is) throws IOException, SubtitleParsingException {
    // Wrap input stream into Apache Commons IO BOMInputStream
    BOMInputStream bomIn = new BOMInputStream(is, false);

    // Create subtitle object
    WebVTTSubtitle subtitle = new WebVTTSubtitle();

    // Read each line
    BufferedReader webvttReader = new BufferedReader(new InputStreamReader(bomIn, this.charset));
    String line = "";

    // File should start with "WEBVTT" on the first line
    line = webvttReader.readLine();
    if (line == null) {
      throw new SubtitleParsingException("WEBVTT Header line is null");
    }

    if (!line.startsWith("WEBVTT")) {
      throw new SubtitleParsingException("Header line did not start with WEBVTT. Got " + line);
    }

    subtitle.addHeaderLine(line);

    // While this is not mentioned in the W3C specs, it seems to be common practice to have additional lines after
    // the header containing metadata information on the file.
    while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
      subtitle.addHeaderLine(line);
    }

    // Process the cues
    while ((line = webvttReader.readLine()) != null) {
      WebVTTSubtitleCue cue = new WebVTTSubtitleCue();

      // Skip additional newlines
      if (line.isEmpty()) {
        continue;
      }

      if (line.startsWith("REGION")) {
        WebVTTSubtitleRegion region = new WebVTTSubtitleRegion();
        region.addLine(line);
        while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
          region.addLine(line);
        }
        subtitle.addRegion(region);
        continue;
      }

      if (line.startsWith("STYLE")) {
        WebVTTSubtitleStyle style = new WebVTTSubtitleStyle();
        style.addLine(line);
        while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
          style.addLine(line);
        }
        subtitle.addStyle(style);
        continue;
      }

      if (line.startsWith("NOTE")) {
        while ((line = webvttReader.readLine()) != null && !line.isEmpty()) {
          // do nothing
        }
        continue;
      }

      // Parse the cue identifier (if present)
      Matcher matcher = WEBVTT_CUE_IDENTIFIER.matcher(line);
      if (matcher.find()) {
        cue.setId(line);
        line = webvttReader.readLine();
      }

      // Parse the cue timestamps
      matcher = WEBVTT_TIMESTAMP.matcher(line);

      // Parse start timestamp
      if (!matcher.find()) {
        throw new SubtitleParsingException("Expected cue start time: " + line);
      } else {
        cue.setStartTime(parseTimestamp(matcher.group()));
      }

      // Parse end timestamp
      if (!matcher.find()) {
        throw new SubtitleParsingException("Expected cue end time: " + line);
      } else {
        cue.setEndTime(parseTimestamp(matcher.group()));
      }

      // Parse cue settings list
      String cueSettings = line.substring(matcher.end()).trim();
      if (!cueSettings.isEmpty()) {
        cue.setCueSettingsList(cueSettings);
      }


      // Parse text
      while (((line = webvttReader.readLine()) != null) && (!line.isEmpty())) {
        cue.addLine(line);
      }

      subtitle.addCue(cue);
    }

    webvttReader.close();
    is.close();

    return subtitle;
  }

  private static long parseTimestamp(String s) throws NumberFormatException {
    if (!s.matches(WEBVTT_TIMESTAMP_STRING)) {
      throw new NumberFormatException("has invalid format");
    }

    String[] parts = s.split("\\.", 2);
    long value = 0;
    for (String group : parts[0].split(":")) {
      value = value * 60 + Long.parseLong(group);
    }
    return (value * 1000 + Long.parseLong(parts[1]));
  }
}