View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  
22  package org.opencastproject.caption.converters;
23  
24  import org.opencastproject.caption.api.Caption;
25  import org.opencastproject.caption.api.CaptionConverter;
26  import org.opencastproject.caption.api.CaptionConverterException;
27  import org.opencastproject.caption.api.IllegalTimeFormatException;
28  import org.opencastproject.caption.api.Time;
29  import org.opencastproject.caption.impl.CaptionImpl;
30  import org.opencastproject.caption.impl.TimeImpl;
31  import org.opencastproject.caption.util.TimeUtil;
32  import org.opencastproject.mediapackage.MediaPackageElement;
33  import org.opencastproject.mediapackage.MediaPackageElement.Type;
34  import org.opencastproject.util.XmlSafeParser;
35  
36  import org.apache.commons.io.IOUtils;
37  import org.osgi.service.component.annotations.Component;
38  import org.slf4j.Logger;
39  import org.slf4j.LoggerFactory;
40  import org.w3c.dom.Document;
41  import org.w3c.dom.Element;
42  import org.w3c.dom.Node;
43  import org.w3c.dom.NodeList;
44  import org.xml.sax.Attributes;
45  import org.xml.sax.SAXException;
46  import org.xml.sax.helpers.DefaultHandler;
47  
48  import java.io.IOException;
49  import java.io.InputStream;
50  import java.io.OutputStream;
51  import java.io.OutputStreamWriter;
52  import java.util.ArrayList;
53  import java.util.LinkedList;
54  import java.util.List;
55  
56  import javax.xml.parsers.DocumentBuilder;
57  import javax.xml.parsers.ParserConfigurationException;
58  import javax.xml.parsers.SAXParser;
59  import javax.xml.parsers.SAXParserFactory;
60  import javax.xml.transform.Transformer;
61  import javax.xml.transform.TransformerConfigurationException;
62  import javax.xml.transform.TransformerException;
63  import javax.xml.transform.TransformerFactory;
64  import javax.xml.transform.dom.DOMSource;
65  import javax.xml.transform.stream.StreamResult;
66  
67  /**
68   * This is converter for DFXP, XML based caption format. DOM parser is used for both caption importing and exporting,
69   * while SAX parser is used for determining which languages are present (DFXP can contain multiple languages).
70   */
71  @Component(
72      immediate = true,
73      service = { CaptionConverter.class },
74      property = {
75          "service.description=DFXP caption converter",
76          "caption.format=dfxp"
77      }
78  )
79  public class DFXPCaptionConverter implements CaptionConverter {
80  
81    /** logging utility */
82    private static final Logger logger = LoggerFactory.getLogger(DFXPCaptionConverter.class);
83  
84    private static final String EXTENSION = "dfxp.xml";
85  
86    /**
87     * {@inheritDoc} Parser used for parsing XML document is DOM parser. Language parameter will determine which language
88     * is searched for and parsed. If there is no matching language, empty collection is returned. If language parameter
89     * is <code>null</code> first language found is parsed.
90     *
91     * @see org.opencastproject.caption.api.CaptionConverter#importCaption(java.io.InputStream, java.lang.String)
92     */
93    @Override
94    public List<Caption> importCaption(InputStream in, String language) throws CaptionConverterException {
95  
96      // create new collection
97      List<Caption> collection = new ArrayList<Caption>();
98  
99      Document doc;
100     try {
101       DocumentBuilder builder = XmlSafeParser.newDocumentBuilderFactory().newDocumentBuilder();
102       doc = builder.parse(in);
103       doc.getDocumentElement().normalize();
104     } catch (ParserConfigurationException e) {
105       throw new CaptionConverterException("Could not parse captions", e);
106     } catch (SAXException e) {
107       throw new CaptionConverterException("Could not parse captions", e);
108     } catch (IOException e) {
109       throw new CaptionConverterException("Could not parse captions", e);
110     }
111 
112     // get all <div> elements since they contain information about language
113     NodeList divElements = doc.getElementsByTagName("div");
114 
115     Element targetDiv = null;
116     if (language != null) {
117       // find first <div> element with matching language
118       for (int i = 0; i < divElements.getLength(); i++) {
119         Element n = (Element) divElements.item(i);
120         if (n.getAttribute("xml:lang").equals(language)) {
121           targetDiv = n;
122           break;
123         }
124       }
125     } else {
126       if (divElements.getLength() > 1) {
127         // more than one existing <div> element, no language specified
128         logger.warn("More than one <div> element available. Parsing first one...");
129       }
130       if (divElements.getLength() != 0) {
131         targetDiv = (Element) divElements.item(0);
132       }
133     }
134 
135     // check if we found node
136     if (targetDiv == null) {
137       logger.warn("No suitable <div> element found for language {}", language);
138     } else {
139       NodeList pElements = targetDiv.getElementsByTagName("p");
140 
141       // initialize start time
142       Time time = null;
143       try {
144         time = new TimeImpl(0, 0, 0, 0);
145       } catch (IllegalTimeFormatException e1) {
146       }
147 
148       for (int i = 0; i < pElements.getLength(); i++) {
149         try {
150           Caption caption = parsePElement((Element) pElements.item(i));
151           // check time
152           if (caption.getStartTime().compareTo(time) < 0
153                   || caption.getStopTime().compareTo(caption.getStartTime()) <= 0) {
154             logger.warn("Caption with invalid time encountered. Skipping...");
155             continue;
156           }
157           collection.add(caption);
158         } catch (IllegalTimeFormatException e) {
159           logger.warn("Caption with invalid time format encountered. Skipping...");
160         }
161       }
162     }
163 
164     // return collection
165     return collection;
166   }
167 
168   /**
169    * Parse &lt;p&gt; element which contains one caption.
170    *
171    * @param p
172    *          &lt;p&gt; element to be parsed
173    * @return new {@link Caption} object
174    * @throws IllegalTimeFormatException
175    *           if time format does not match with expected format for DFXP
176    */
177   private Caption parsePElement(Element p) throws IllegalTimeFormatException {
178     Time begin = TimeUtil.importDFXP(p.getAttribute("begin").trim());
179     Time end = TimeUtil.importDFXP(p.getAttribute("end").trim());
180     // FIXME add logic for duration if end is absent
181 
182     // get text inside p
183     String[] textArray = getTextCore(p).split("\n");
184 
185     return new CaptionImpl(begin, end, textArray);
186   }
187 
188   /**
189    * Returns caption text stripped of all tags.
190    *
191    * @param p
192    *          &lt;p&gt; element to be parsed
193    * @return Caption text with \n as new line character
194    */
195   private String getTextCore(Node p) {
196     StringBuffer captionText = new StringBuffer();
197     // get children
198     NodeList list = p.getChildNodes();
199     for (int i = 0; i < list.getLength(); i++) {
200       if (list.item(i).getNodeType() == Node.TEXT_NODE) {
201         captionText.append(list.item(i).getTextContent());
202       } else if ("br".equals(list.item(i).getNodeName())) {
203         captionText.append("\n");
204       } else {
205         captionText.append(getTextCore(list.item(i)));
206       }
207     }
208     return captionText.toString().trim();
209   }
210 
211   /**
212    * {@inheritDoc} DOM parser is used to parse template from which whole document is then constructed.
213    */
214   @Override
215   public void exportCaption(OutputStream outputStream, List<Caption> captions, String language) throws IOException {
216     // get document builder factory and parse template
217     Document doc = null;
218     InputStream is = null;
219     try {
220       DocumentBuilder builder = XmlSafeParser.newDocumentBuilderFactory().newDocumentBuilder();
221       // load dfxp template from file
222       is = DFXPCaptionConverter.class.getResourceAsStream("/templates/template.dfxp.xml");
223       doc = builder.parse(is);
224     } catch (ParserConfigurationException e) {
225       // should not happen
226       throw new RuntimeException(e);
227     } catch (SAXException e) {
228       // should not happen unless template is invalid
229       throw new RuntimeException(e);
230     } catch (IOException e) {
231       // should not happen
232       throw new RuntimeException(e);
233     } finally {
234       IOUtils.closeQuietly(is);
235     }
236 
237     // retrieve body element
238     Node bodyNode = doc.getElementsByTagName("body").item(0);
239 
240     // create new div element with specified language
241     Element divNode = doc.createElement("div");
242     divNode.setAttribute("xml:lang", language != null ? language : "und");
243     bodyNode.appendChild(divNode);
244 
245     // update document
246     for (Caption caption : captions) {
247       Element newNode = doc.createElement("p");
248       newNode.setAttribute("begin", TimeUtil.exportToDFXP(caption.getStartTime()));
249       newNode.setAttribute("end", TimeUtil.exportToDFXP(caption.getStopTime()));
250       String[] captionText = caption.getCaption();
251       // text part
252       newNode.appendChild(doc.createTextNode(captionText[0]));
253       for (int i = 1; i < captionText.length; i++) {
254         newNode.appendChild(doc.createElement("br"));
255         newNode.appendChild(doc.createTextNode(captionText[i]));
256       }
257       divNode.appendChild(newNode);
258     }
259 
260     // initialize stream writer
261     OutputStreamWriter osw = new OutputStreamWriter(outputStream, "UTF-8");
262     StreamResult result = new StreamResult(osw);
263     DOMSource source = new DOMSource(doc);
264     TransformerFactory tfactory = XmlSafeParser.newTransformerFactory();
265     Transformer transformer;
266     try {
267       transformer = tfactory.newTransformer();
268       transformer.transform(source, result);
269       osw.flush();
270     } catch (TransformerConfigurationException e) {
271       // should not happen
272       throw new RuntimeException(e);
273     } catch (TransformerException e) {
274       // should not happen
275       throw new RuntimeException(e);
276     } finally {
277       IOUtils.closeQuietly(osw);
278     }
279   }
280 
281   /**
282    * {@inheritDoc} Uses SAX parser to quickly read the document and retrieve available languages.
283    *
284    * @see org.opencastproject.caption.api.CaptionConverter#getLanguageList(java.io.InputStream)
285    */
286   @Override
287   public String[] getLanguageList(InputStream input) throws CaptionConverterException {
288 
289     // create lang list
290     final List<String> langList = new LinkedList<String>();
291 
292     // get SAX parser
293     SAXParserFactory factory = XmlSafeParser.newSAXParserFactory();
294     try {
295       SAXParser parser = factory.newSAXParser();
296       // create handler
297       DefaultHandler handler = new DefaultHandler() {
298         @Override
299         public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
300           if ("div".equals(qName)) {
301             // we found div tag - let's make a lookup for language
302             String lang = attributes.getValue("xml:lang");
303             if (lang == null) {
304               // should never happen
305               logger.warn("Missing xml:lang attribute for div element.");
306             } else if (langList.contains(lang)) {
307               logger.warn("Multiple div elements with same language.");
308             } else {
309               langList.add(lang);
310             }
311           }
312         }
313       };
314 
315       // parse stream
316       parser.parse(input, handler);
317     } catch (ParserConfigurationException e) {
318       // should not happen
319       throw new RuntimeException(e);
320     } catch (SAXException e) {
321       throw new CaptionConverterException("Could not parse captions", e);
322     } catch (IOException e) {
323       throw new RuntimeException(e);
324     }
325 
326     return langList.toArray(new String[0]);
327   }
328 
329   /**
330    * {@inheritDoc}
331    *
332    * @see org.opencastproject.caption.api.CaptionConverter#getExtension()
333    */
334   @Override
335   public String getExtension() {
336     return EXTENSION;
337   }
338 
339   @Override
340   public Type getElementType() {
341     return MediaPackageElement.Type.Attachment;
342   }
343 
344 }