<?php
namespace CorpusParole\Libraries\Transcript;
/**
 * Transcript converter for XML sources whose root element is TEXT.
 *
 * The DOM document is read from $this->source (provided by the parent class):
 * titles come from /TEXT/HEADER/TITLE and one annotation is built for every
 * /TEXT/S element.
 */
class LacitoTranscriptConverter extends TranscriptConverterBase {

    /**
     * Get the document titles, indexed by language code.
     *
     * The language of a title is its own xml:lang attribute, falling back to
     * the xml:lang attribute of the root element, and finally to 'und'.
     * When several titles share a language, the last one wins.
     *
     * @return array|null map "language code" => "title text", or null when
     *                    the document has no /TEXT/HEADER/TITLE element.
     */
    public function getSourceTitle() {
        $baseNode = $this->source->documentElement;
        $defaultLanguage = $baseNode->hasAttribute('xml:lang')
            ? $baseNode->getAttribute('xml:lang')
            : 'und';

        $resTitle = [];
        $xpath = new \DOMXPath($this->source);
        foreach ($xpath->evaluate("/TEXT/HEADER/TITLE") as $headerTitle) {
            $titleLanguage = $headerTitle->hasAttribute('xml:lang')
                ? $headerTitle->getAttribute('xml:lang')
                : $defaultLanguage;
            $resTitle[$titleLanguage] = (string)$headerTitle->textContent;
        }

        // An empty result is reported as null rather than as an empty array.
        return $resTitle ?: null;
    }

    /**
     * Intentionally empty: this converter needs no preliminary parsing step,
     * getSourceTitle() and buildAnnotations() query the DOM directly.
     */
    public function parseSource() {
        // do nothing
    }

    /**
     * Copy the time bounds of a sentence-level AUDIO node into the annotation.
     *
     * The 'start' and 'end' attributes are parsed as floats and multiplied by
     * 1000 (presumably seconds converted to milliseconds - to be confirmed
     * against the source format).
     *
     * @param \DOMElement $audioNode  the AUDIO element
     * @param array       $annotation the annotation being built (modified in place)
     */
    private function addAudioInfo($audioNode, &$annotation) {
        $annotation['begin'] = floatval($audioNode->getAttribute('start')) * 1000;
        $annotation['end'] = floatval($audioNode->getAttribute('end')) * 1000;
    }

    /**
     * Store the text of a sentence-level FORM node as the annotation content.
     *
     * @param \DOMElement $formNode   the FORM element
     * @param array       $annotation the annotation being built (modified in place)
     */
    private function addFormInfo($formNode, &$annotation) {
        $annotation['content']['data']['content'] = $formNode->textContent;
    }

    /**
     * Store the sentence-level translation, tagged with the xml:lang
     * attribute of the TRANSL node.
     *
     * @param \DOMElement $translNode the TRANSL element
     * @param array       $annotation the annotation being built (modified in place)
     */
    private function addTranslInfo($translNode, &$annotation) {
        $annotation['content']['data']['transl']
            = $this->buildTextvalue($translNode->textContent, $translNode->getAttribute('xml:lang'));
    }

    /**
     * Append the description of a W (word) node to the annotation word list.
     *
     * Only the first FORM and the first TRANSL children are used; every M
     * child is converted with getMorpheneInfo(). When an AUDIO child carries
     * both a 'start' and an 'end' attribute, 'begin' and 'end' are added to
     * the word definition.
     *
     * NOTE(review): word timings are converted with intval() and are not
     * multiplied by 1000, unlike the sentence timings in addAudioInfo().
     * This is kept as is; confirm which unit the consumers expect.
     *
     * @param \DOMElement $wNode      the W element
     * @param array       $annotation the annotation being built (modified in place)
     */
    private function addWordInfo($wNode, &$annotation) {
        if (!array_key_exists('words', $annotation['content']['data'])) {
            $annotation['content']['data']['words'] = [];
        }

        $wLang = $wNode->getAttribute('xml:lang');
        $content = null;
        $transl = null;
        $morphenes = [];
        $wbegin = null;
        $wend = null;

        foreach ($wNode->childNodes as $node) {
            if ($node->nodeName === "FORM" && is_null($content)) {
                $content = $this->buildTextvalue($node->textContent, $wLang);
            } elseif ($node->nodeName === "TRANSL" && is_null($transl)) {
                $transl = $this->buildTextvalue($node->textContent, $node->getAttribute('xml:lang'));
            } elseif ($node->nodeName === "M") {
                $morphInfo = $this->getMorpheneInfo($node, $wLang);
                if (!is_null($morphInfo)) {
                    $morphenes[] = $morphInfo;
                }
            } elseif ($node->nodeName === 'AUDIO') {
                // The time bounds are attributes of the AUDIO child itself.
                $wbegin = $node->getAttribute('start');
                $wend = $node->getAttribute('end');
            }
        }

        $wDef = ['content' => $content, 'transl' => $transl, 'morphenes' => $morphenes];

        // getAttribute() yields '' for a missing attribute. Compare against ''
        // instead of using empty() so that a bound of "0" is not discarded.
        if ((string)$wbegin !== '' && (string)$wend !== '') {
            $wDef['begin'] = intval($wbegin);
            $wDef['end'] = intval($wend);
        }

        $annotation['content']['data']['words'][] = $wDef;
    }

    /**
     * Build the description of an M (morpheme) node.
     *
     * Only the first FORM and the first TRANSL children are used. The form
     * language is the xml:lang attribute of the M node, or the language of the
     * enclosing word when that attribute is missing or empty. 'begin'/'end'
     * are added when an AUDIO child carries both bounds; 'class' and 'sclass'
     * are copied from the M node attributes when they are not empty.
     *
     * NOTE(review): timings are converted with intval() and are not multiplied
     * by 1000, unlike the sentence timings in addAudioInfo().
     *
     * @param \DOMElement $mNode the M element
     * @param string      $wLang language of the enclosing word, used as fallback
     *
     * @return array the morpheme definition
     */
    private function getMorpheneInfo($mNode, $wLang) {
        $mLang = $mNode->getAttribute('xml:lang');
        if (empty($mLang)) {
            $mLang = $wLang;
        }

        $content = null;
        $transl = null;
        $mbegin = null;
        $mend = null;

        foreach ($mNode->childNodes as $node) {
            if ($node->nodeName === "FORM" && is_null($content)) {
                $content = $this->buildTextvalue($node->textContent, $mLang);
            } elseif ($node->nodeName === "TRANSL" && is_null($transl)) {
                $transl = $this->buildTextvalue($node->textContent, $node->getAttribute('xml:lang'));
            } elseif ($node->nodeName === 'AUDIO') {
                $mbegin = $node->getAttribute('start');
                $mend = $node->getAttribute('end');
            }
        }

        $mDef = ['content' => $content, 'transl' => $transl];

        // Compare against '' instead of using empty() so that a bound of "0"
        // is not discarded.
        if ((string)$mbegin !== '' && (string)$mend !== '') {
            $mDef['begin'] = intval($mbegin);
            $mDef['end'] = intval($mend);
        }

        $mClass = $mNode->getAttribute('class');
        if (!empty($mClass)) {
            $mDef['class'] = $mClass;
        }

        $mSclass = $mNode->getAttribute('sclass');
        if (!empty($mSclass)) {
            $mDef['sclass'] = $mSclass;
        }

        return $mDef;
    }

    /**
     * Build one annotation per /TEXT/S element of the source document.
     *
     * Each annotation gets an id made of the document id followed by "_a" and
     * the 1-based, zero-padded (3 digits) position of the S element, the media
     * reference id, and a JSON content whose data is filled from the S
     * attributes ('who' becomes 'speaker') and its AUDIO, FORM, TRANSL and W
     * child elements. Other child elements are ignored.
     *
     * @return array the list of annotations, in document order
     */
    public function buildAnnotations() {
        $xpath = new \DOMXPath($this->source);
        $annotationList = [];

        foreach ($xpath->evaluate("/TEXT/S") as $i => $s) {
            $data = [];
            $speaker = $s->getAttribute('who');
            if ($speaker) {
                $data['speaker'] = $speaker;
            }

            $annotation = [
                "id" => $this->document->getId()."_a".sprintf("%03d", $i + 1),
                "media" => $this->getMediaRefId(),
                "content" => [
                    "mimetype" => "application/json",
                    "data" => $data
                ]
            ];

            foreach ($s->childNodes as $cnode) {
                // Skip text, comment and other non-element nodes.
                if ($cnode->nodeType !== XML_ELEMENT_NODE) {
                    continue;
                }
                switch ($cnode->tagName) {
                    case "AUDIO":
                        $this->addAudioInfo($cnode, $annotation);
                        break;
                    case "FORM":
                        $this->addFormInfo($cnode, $annotation);
                        break;
                    case "TRANSL":
                        $this->addTranslInfo($cnode, $annotation);
                        break;
                    case "W":
                        $this->addWordInfo($cnode, $annotation);
                        break;
                }
            }

            $annotationList[] = $annotation;
        }

        return $annotationList;
    }
}