DFXPCaptionConverter.java

/*
 * Licensed to The Apereo Foundation under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 *
 * The Apereo Foundation licenses this file to you under the Educational
 * Community License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License
 * at:
 *
 *   http://opensource.org/licenses/ecl2.txt
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 */

package org.opencastproject.caption.converters;

import org.opencastproject.caption.api.Caption;
import org.opencastproject.caption.api.CaptionConverter;
import org.opencastproject.caption.api.CaptionConverterException;
import org.opencastproject.caption.api.IllegalTimeFormatException;
import org.opencastproject.caption.api.Time;
import org.opencastproject.caption.impl.CaptionImpl;
import org.opencastproject.caption.impl.TimeImpl;
import org.opencastproject.caption.util.TimeUtil;
import org.opencastproject.mediapackage.MediaPackageElement;
import org.opencastproject.mediapackage.MediaPackageElement.Type;
import org.opencastproject.util.XmlSafeParser;

import org.apache.commons.io.IOUtils;
import org.osgi.service.component.annotations.Component;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

/**
 * Converter for DFXP, an XML based caption format. A DOM parser is used for both importing and exporting captions,
 * while a SAX parser is used for determining which languages are present (a DFXP document can contain multiple
 * languages, each one in its own &lt;div&gt; element carrying an <code>xml:lang</code> attribute).
 */
@Component(
    immediate = true,
    service = { CaptionConverter.class },
    property = {
        "service.description=DFXP caption converter",
        "caption.format=dfxp"
    }
)
public class DFXPCaptionConverter implements CaptionConverter {

  /** logging utility */
  private static final Logger logger = LoggerFactory.getLogger(DFXPCaptionConverter.class);

  /** File extension returned by {@link #getExtension()} */
  private static final String EXTENSION = "dfxp.xml";

  /** Classpath location of the template document into which exported captions are inserted */
  private static final String TEMPLATE_RESOURCE = "/templates/template.dfxp.xml";

  /**
   * {@inheritDoc} Parser used for parsing XML document is DOM parser. Language parameter will determine which language
   * is searched for and parsed. If there is no matching language, empty collection is returned. If language parameter
   * is <code>null</code> first language found is parsed.
   * <p>
   * Captions whose time attributes cannot be parsed, that start before zero, or that do not end after they start are
   * logged and skipped rather than aborting the whole import.
   *
   * @param in
   *          stream containing the DFXP document
   * @param language
   *          value of the <code>xml:lang</code> attribute to look for, or <code>null</code> to take the first
   *          &lt;div&gt; element
   * @return the captions that were parsed, possibly empty, never <code>null</code>
   * @throws CaptionConverterException
   *           if the document cannot be read or parsed as XML
   * @see org.opencastproject.caption.api.CaptionConverter#importCaption(java.io.InputStream, java.lang.String)
   */
  @Override
  public List<Caption> importCaption(InputStream in, String language) throws CaptionConverterException {

    Document doc;
    try {
      DocumentBuilder builder = XmlSafeParser.newDocumentBuilderFactory().newDocumentBuilder();
      doc = builder.parse(in);
      doc.getDocumentElement().normalize();
    } catch (ParserConfigurationException e) {
      throw new CaptionConverterException("Could not parse captions", e);
    } catch (SAXException e) {
      throw new CaptionConverterException("Could not parse captions", e);
    } catch (IOException e) {
      throw new CaptionConverterException("Could not parse captions", e);
    }

    List<Caption> collection = new ArrayList<Caption>();

    Element targetDiv = findTargetDiv(doc, language);
    if (targetDiv == null) {
      logger.warn("No suitable <div> element found for language {}", language);
      return collection;
    }

    // lower bound used to reject captions with a negative start time
    Time zero;
    try {
      zero = new TimeImpl(0, 0, 0, 0);
    } catch (IllegalTimeFormatException e) {
      // Not expected for an all-zero timestamp. Fail loudly instead of carrying a null reference into the
      // comparisons below.
      throw new IllegalStateException("Could not create zero reference time", e);
    }

    NodeList pElements = targetDiv.getElementsByTagName("p");
    for (int i = 0; i < pElements.getLength(); i++) {
      try {
        Caption caption = parsePElement((Element) pElements.item(i));
        // a caption must not start before zero and must end strictly after it starts
        if (caption.getStartTime().compareTo(zero) < 0
                || caption.getStopTime().compareTo(caption.getStartTime()) <= 0) {
          logger.warn("Caption with invalid time encountered. Skipping...");
          continue;
        }
        collection.add(caption);
      } catch (IllegalTimeFormatException e) {
        logger.warn("Caption with invalid time format encountered. Skipping...");
      }
    }

    return collection;
  }

  /**
   * Selects the &lt;div&gt; element whose captions should be imported.
   *
   * @param doc
   *          parsed DFXP document
   * @param language
   *          value of the <code>xml:lang</code> attribute to match, or <code>null</code> to select the first
   *          &lt;div&gt; element in document order
   * @return the selected element or <code>null</code> if the document contains no suitable &lt;div&gt; element
   */
  private Element findTargetDiv(Document doc, String language) {
    // <div> elements carry the language information
    NodeList divElements = doc.getElementsByTagName("div");

    if (language == null) {
      if (divElements.getLength() > 1) {
        // more than one existing <div> element, no language specified
        logger.warn("More than one <div> element available. Parsing first one...");
      }
      return divElements.getLength() == 0 ? null : (Element) divElements.item(0);
    }

    // first <div> element with matching language wins
    for (int i = 0; i < divElements.getLength(); i++) {
      Element div = (Element) divElements.item(i);
      if (language.equals(div.getAttribute("xml:lang"))) {
        return div;
      }
    }
    return null;
  }

  /**
   * Parse &lt;p&gt; element which contains one caption. The <code>begin</code> and <code>end</code> attributes are
   * trimmed and handed to {@link TimeUtil#importDFXP(String)}; the text content is split into lines at every
   * <code>\n</code>.
   *
   * @param p
   *          &lt;p&gt; element to be parsed
   * @return new {@link Caption} object
   * @throws IllegalTimeFormatException
   *           if time format does not match with expected format for DFXP
   */
  private Caption parsePElement(Element p) throws IllegalTimeFormatException {
    Time begin = TimeUtil.importDFXP(p.getAttribute("begin").trim());
    Time end = TimeUtil.importDFXP(p.getAttribute("end").trim());
    // FIXME add logic for duration if end is absent

    // get text inside p
    String[] textArray = getTextCore(p).split("\n");

    return new CaptionImpl(begin, end, textArray);
  }

  /**
   * Returns caption text stripped of all tags. Text nodes are copied as they are, every &lt;br&gt; element becomes a
   * new line character and any other child node is processed recursively. The result of each invocation (including
   * the recursive ones) is trimmed.
   *
   * @param p
   *          node whose text should be collected, initially the &lt;p&gt; element
   * @return Caption text with \n as new line character
   */
  private String getTextCore(Node p) {
    // local accumulator only, so the unsynchronized builder is sufficient
    StringBuilder captionText = new StringBuilder();
    NodeList children = p.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      Node child = children.item(i);
      if (child.getNodeType() == Node.TEXT_NODE) {
        captionText.append(child.getTextContent());
      } else if ("br".equals(child.getNodeName())) {
        captionText.append("\n");
      } else {
        captionText.append(getTextCore(child));
      }
    }
    return captionText.toString().trim();
  }

  /**
   * {@inheritDoc} DOM parser is used to parse template from which whole document is then constructed.
   * <p>
   * All captions are written into a single new &lt;div&gt; element appended to the template's &lt;body&gt;. If
   * <code>language</code> is <code>null</code> the element's <code>xml:lang</code> attribute is set to
   * <code>und</code>. The lines of a caption are separated by &lt;br&gt; elements; a caption without any text lines
   * results in an empty &lt;p&gt; element. The document is written UTF-8 encoded and the output stream is closed
   * once the document has been written or writing has failed.
   *
   * @param outputStream
   *          stream the DFXP document is written to
   * @param captions
   *          captions to export
   * @param language
   *          language of the captions, may be <code>null</code>
   * @throws IOException
   *           if writing to the output stream fails
   * @throws IllegalStateException
   *           if the bundled template is missing or has no &lt;body&gt; element
   */
  @Override
  public void exportCaption(OutputStream outputStream, List<Caption> captions, String language) throws IOException {
    Document doc = loadTemplate();

    // retrieve body element
    Node bodyNode = doc.getElementsByTagName("body").item(0);
    if (bodyNode == null) {
      throw new IllegalStateException("DFXP template " + TEMPLATE_RESOURCE + " does not contain a <body> element");
    }

    // create new div element with specified language
    Element divNode = doc.createElement("div");
    divNode.setAttribute("xml:lang", language != null ? language : "und");
    bodyNode.appendChild(divNode);

    // one <p> element per caption
    for (Caption caption : captions) {
      Element newNode = doc.createElement("p");
      newNode.setAttribute("begin", TimeUtil.exportToDFXP(caption.getStartTime()));
      newNode.setAttribute("end", TimeUtil.exportToDFXP(caption.getStopTime()));

      // text part: lines are separated by <br>, a caption without lines yields an empty <p>
      String[] captionText = caption.getCaption();
      if (captionText != null) {
        for (int i = 0; i < captionText.length; i++) {
          if (i > 0) {
            newNode.appendChild(doc.createElement("br"));
          }
          newNode.appendChild(doc.createTextNode(captionText[i]));
        }
      }
      divNode.appendChild(newNode);
    }

    // initialize stream writer
    OutputStreamWriter osw = new OutputStreamWriter(outputStream, "UTF-8");
    StreamResult result = new StreamResult(osw);
    DOMSource source = new DOMSource(doc);
    TransformerFactory tfactory = XmlSafeParser.newTransformerFactory();
    Transformer transformer;
    try {
      transformer = tfactory.newTransformer();
      transformer.transform(source, result);
      osw.flush();
    } catch (TransformerConfigurationException e) {
      // should not happen
      throw new RuntimeException(e);
    } catch (TransformerException e) {
      // should not happen
      throw new RuntimeException(e);
    } finally {
      // closes the wrapped output stream as well
      IOUtils.closeQuietly(osw);
    }
  }

  /**
   * Loads and parses the DFXP template document from the classpath.
   *
   * @return the parsed template
   * @throws IllegalStateException
   *           if the template resource cannot be found
   * @throws RuntimeException
   *           if the template cannot be read or parsed
   */
  private Document loadTemplate() {
    InputStream is = null;
    try {
      DocumentBuilder builder = XmlSafeParser.newDocumentBuilderFactory().newDocumentBuilder();
      is = DFXPCaptionConverter.class.getResourceAsStream(TEMPLATE_RESOURCE);
      if (is == null) {
        // getResourceAsStream signals a missing resource with null; report it with the resource name
        throw new IllegalStateException("DFXP template " + TEMPLATE_RESOURCE + " not found on classpath");
      }
      return builder.parse(is);
    } catch (ParserConfigurationException e) {
      // should not happen
      throw new RuntimeException(e);
    } catch (SAXException e) {
      // should not happen unless template is invalid
      throw new RuntimeException(e);
    } catch (IOException e) {
      // should not happen
      throw new RuntimeException(e);
    } finally {
      IOUtils.closeQuietly(is);
    }
  }

  /**
   * {@inheritDoc} Uses SAX parser to quickly read the document and retrieve available languages.
   * <p>
   * Languages are returned in the order of their first occurrence. &lt;div&gt; elements without an
   * <code>xml:lang</code> attribute and repeated languages are logged and ignored.
   *
   * @param input
   *          stream containing the DFXP document
   * @return the distinct languages found, possibly empty
   * @throws CaptionConverterException
   *           if the document cannot be read or parsed
   * @see org.opencastproject.caption.api.CaptionConverter#getLanguageList(java.io.InputStream)
   */
  @Override
  public String[] getLanguageList(InputStream input) throws CaptionConverterException {

    // create lang list
    final List<String> langList = new LinkedList<String>();

    // get SAX parser
    SAXParserFactory factory = XmlSafeParser.newSAXParserFactory();
    try {
      SAXParser parser = factory.newSAXParser();
      // create handler
      DefaultHandler handler = new DefaultHandler() {
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
          if (!"div".equals(qName)) {
            return;
          }
          // we found div tag - let's make a lookup for language
          String lang = attributes.getValue("xml:lang");
          if (lang == null) {
            // should never happen
            logger.warn("Missing xml:lang attribute for div element.");
          } else if (langList.contains(lang)) {
            logger.warn("Multiple div elements with same language.");
          } else {
            langList.add(lang);
          }
        }
      };

      // parse stream
      parser.parse(input, handler);
    } catch (ParserConfigurationException e) {
      // should not happen
      throw new RuntimeException(e);
    } catch (SAXException e) {
      throw new CaptionConverterException("Could not parse captions", e);
    } catch (IOException e) {
      // report unreadable input the same way importCaption does
      throw new CaptionConverterException("Could not parse captions", e);
    }

    return langList.toArray(new String[0]);
  }

  /**
   * {@inheritDoc}
   *
   * @return the constant <code>dfxp.xml</code>
   * @see org.opencastproject.caption.api.CaptionConverter#getExtension()
   */
  @Override
  public String getExtension() {
    return EXTENSION;
  }

  /**
   * {@inheritDoc}
   *
   * @return always {@link MediaPackageElement.Type#Attachment}
   */
  @Override
  public Type getElementType() {
    return MediaPackageElement.Type.Attachment;
  }

}