<?php
namespace CorpusParole\Libraries\Transcript;
/**
 * Transcript converter for Transcriber-style XML sources
 * (/Trans/Topics, /Trans/Speakers, /Trans/Episode/Section/Turn).
 *
 * parseSource() walks the XML held in $this->source and fills the private
 * collections below; the build*() methods then hand those collections back.
 * $this->source, $this->document and getMediaRefId() are not defined in this
 * class (presumably they come from TranscriptConverterBase — confirm there).
 */
class TranscriberTranscriptConverter extends TranscriptConverterBase {

    /** Topic resources, each ['id' => generated id, 'desc' => source description]. */
    private $topics = [];

    /** Map: topic id found in the source => generated topic id. */
    private $topicIds = [];

    /** Speaker resources, each ['id' => generated id, 'name' => source name]. */
    private $speakers = [];

    /** Map: speaker id found in the source => generated speaker id. */
    private $speakerIds = [];

    /** One list entry per Section, referencing the turns it contains. */
    private $lists = [];

    /** One annotation type per Turn. */
    private $annotationTypes = [];

    /** Flat list of every annotation produced, in document order. */
    private $annotations = [];

    /** Next turn number (1-based), shared by all sections. */
    private $turnCounter = 1;

    /** Next annotation number (1-based), shared by all turns. */
    private $annotationCounter = 1;

    /**
     * Reads /Trans/Topics/Topic and registers one topic resource per node,
     * remembering the source id => generated id mapping for later lookups.
     */
    private function parseTopics() {
        $xpath = new \DOMXPath($this->source);
        foreach($xpath->evaluate("/Trans/Topics/Topic") as $i => $topicNode) {
            $topicId = $this->document->getId()."_tpc".sprintf("%03d", $i+1);
            $this->topicIds[$topicNode->getAttribute('id')] = $topicId;
            $this->topics[] = [
                'id' => $topicId,
                'desc' => $topicNode->getAttribute('desc')
            ];
        }
    }

    /**
     * Reads /Trans/Speakers/Speaker and registers one speaker resource per node,
     * remembering the source id => generated id mapping for later lookups.
     */
    private function parseSpeakers() {
        $xpath = new \DOMXPath($this->source);
        foreach($xpath->evaluate("/Trans/Speakers/Speaker") as $i => $speakerNode) {
            $speakerId = $this->document->getId()."_spkr".sprintf("%03d", $i+1);
            $this->speakerIds[$speakerNode->getAttribute('id')] = $speakerId;
            $this->speakers[] = [
                'id' => $speakerId,
                'name' => $speakerNode->getAttribute('name')
            ];
        }
    }

    /**
     * Resolves the whitespace-separated "speaker" attribute of a Turn.
     *
     * Empty tokens (missing attribute, repeated spaces) are skipped so that
     * the 1-based "nb" of a Who element keeps pointing at the right entry.
     *
     * @param \DOMElement $turnNode the Turn element
     * @return array list of ['id-ref' => generated speaker id] for known
     *               speakers, or the raw source token for unknown ones
     */
    private function resolveTurnSpeakers($turnNode) {
        $resolved = [];
        $tokens = preg_split('/\s+/', $turnNode->getAttribute('speaker'), -1, PREG_SPLIT_NO_EMPTY);
        foreach($tokens as $spk) {
            $resolved[] = array_key_exists($spk, $this->speakerIds)
                ? ['id-ref' => $this->speakerIds[$spk]]
                : $spk;
        }
        return $resolved;
    }

    /**
     * Builds the structured content for a non-text Turn child element.
     *
     * @param \DOMElement $node child element of a Turn
     * @return array|null content array for Background, Event, Comment and
     *                    Vocal elements; null for any other element
     */
    private function describeMarkup($node) {
        switch ($node->tagName) {
            case 'Background':
                return [
                    'ctype' => strtolower($node->tagName),
                    'type' => $node->getAttribute('type'),
                    'level' => $node->getAttribute('level')
                ];
            case 'Event':
                return [
                    'ctype' => strtolower($node->tagName),
                    'type' => $node->getAttribute('type'),
                    'extent' => $node->getAttribute('extent'),
                    'desc' => $node->getAttribute('desc')
                ];
            case 'Comment':
            case 'Vocal':
                return [
                    'ctype' => strtolower($node->tagName),
                    'desc' => $node->getAttribute('desc')
                ];
            default:
                return null;
        }
    }

    /**
     * Creates one annotation record and consumes the next annotation number.
     *
     * @param string $turnId generated id of the owning turn
     * @param float $begin begin time of the annotation
     * @param float $end provisional end time (the turn end; overwritten when a Sync follows)
     * @param array|string|null $speaker resolved speaker, or null when none applies
     * @param array|string $content text, or structured content from describeMarkup()
     * @return array the annotation record
     */
    private function createAnnotation($turnId, $begin, $end, $speaker, $content) {
        // The "speaker" key is only present when a speaker is actually known.
        $data = ($speaker !== null) ? ["speaker" => $speaker] : [];
        $data['content'] = $content;
        return [
            'id' => $this->document->getId()."_a".sprintf("%04d", $this->annotationCounter++),
            'begin' => $begin,
            'end' => $end,
            'media' => $this->getMediaRefId(),
            'type' => $turnId,
            'content' => [ "mimetype" => "application/json", "data" => $data],
            'meta' => [ 'id-ref' => $turnId ]
        ];
    }

    /**
     * Walks the children of a Turn and appends the resulting annotations to
     * $this->annotations.
     *
     * - text nodes become text annotations (whitespace-only text is skipped);
     * - Who selects the current speaker among the turn speakers (1-based "nb");
     * - Sync closes every pending annotation at its time and starts a new segment;
     * - Background, Event, Comment and Vocal become structured annotations;
     * - any other node is ignored.
     *
     * @param \DOMElement $turnNode the Turn element
     * @param string $turnId generated id of the turn
     * @param float $begin turn start time
     * @param float $end turn end time
     */
    private function buildTurnAnnotations($turnNode, $turnId, $begin, $end) {
        $pending = [];
        $currentBegin = $begin;
        $turnSpeakers = $this->resolveTurnSpeakers($turnNode);
        // With a single speaker there is no Who element to pick it, so it is implicit.
        $currentSpeaker = (count($turnSpeakers) === 1) ? $turnSpeakers[0] : null;

        foreach($turnNode->childNodes as $cnode) {
            if($cnode->nodeType === XML_TEXT_NODE) {
                $textContent = trim($cnode->textContent);
                // Strict comparison: empty() would also discard the text "0".
                if($textContent === '') {
                    continue;
                }
                $pending[] = $this->createAnnotation($turnId, $currentBegin, $end, $currentSpeaker, $textContent);
                continue;
            }
            if($cnode->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }

            if($cnode->tagName === "Who") {
                $speakerIndex = intval($cnode->getAttribute('nb')) - 1;
                // A missing or out-of-range "nb" means no identifiable speaker.
                $currentSpeaker = isset($turnSpeakers[$speakerIndex]) ? $turnSpeakers[$speakerIndex] : null;
            } elseif($cnode->tagName === "Sync") {
                // Time attribute is scaled by 1000 (presumably seconds to milliseconds — confirm).
                $currentBegin = floatval($cnode->getAttribute('time')) * 1000;
                // Everything collected since the previous Sync ends where this one starts.
                foreach(array_keys($pending) as $key) {
                    $pending[$key]['end'] = $currentBegin;
                }
                $this->annotations = array_merge($this->annotations, $pending);
                $pending = [];
            } else {
                $content = $this->describeMarkup($cnode);
                if($content !== null) {
                    $pending[] = $this->createAnnotation($turnId, $currentBegin, $end, $currentSpeaker, $content);
                }
            }
        }

        // Annotations after the last Sync keep the turn end as their end time.
        $this->annotations = array_merge($this->annotations, $pending);
    }

    /**
     * Registers a Turn as an annotation type and builds its annotations.
     *
     * @param \DOMElement $turnNode the Turn element
     * @return string the generated turn id
     */
    private function parseTurn($turnNode) {
        $turnIndex = $this->turnCounter++;
        $turnId = $this->document->getId()."_trn".sprintf("%04d", $turnIndex);
        // Time attributes are scaled by 1000 (presumably seconds to milliseconds — confirm).
        $begin = floatval($turnNode->getAttribute("startTime")) * 1000;
        $end = floatval($turnNode->getAttribute("endTime")) * 1000;

        $this->annotationTypes[] = [
            'id' => $turnId,
            'dc:title' => "Turn $turnIndex",
            'corpus:begin' => $begin,
            'corpus:end' => $end
        ];

        $this->buildTurnAnnotations($turnNode, $turnId, $begin, $end);

        return $turnId;
    }

    /**
     * Reads /Trans/Episode/Section and registers one list per section, whose
     * items reference the turns found directly under it.
     *
     * The 'corpus:topic' meta entry is only emitted when the section's topic
     * attribute matches a topic registered by parseTopics().
     */
    private function parseSections() {
        $xpath = new \DOMXPath($this->source);
        foreach($xpath->evaluate("/Trans/Episode/Section") as $sectionIndex => $sectionNode) {
            $sectionItems = [];
            foreach($sectionNode->childNodes as $turnNode) {
                if($turnNode->nodeType === XML_ELEMENT_NODE && $turnNode->tagName === 'Turn') {
                    $sectionItems[] = [ "id-ref" => $this->parseTurn($turnNode) ];
                }
            }

            $meta = [];
            $topicRef = $sectionNode->getAttribute('topic');
            if(array_key_exists($topicRef, $this->topicIds)) {
                $meta['corpus:topic'] = ["id-ref" => $this->topicIds[$topicRef]];
            }
            $meta['corpus:begin'] = floatval($sectionNode->getAttribute('startTime'))*1000;
            $meta['corpus:end'] = floatval($sectionNode->getAttribute('endTime'))*1000;

            $this->lists[] = [
                'id' => $this->document->getId()."_sctn".sprintf("%03d", $sectionIndex+1),
                'items' => $sectionItems,
                'meta' => $meta
            ];
        }
    }

    /**
     * Parses the source document: topics and speakers first, because sections
     * and turns look up the id mappings they create.
     */
    public function parseSource() {
        $this->parseTopics();
        $this->parseSpeakers();
        $this->parseSections();
    }

    /**
     * Returns the "topics" and "speakers" resources collected by parseSource().
     *
     * @return array two resource records with JSON content
     */
    public function buildResources() {
        return [
            ['id' => "topics" , "content" => ['mimetype' => 'application/json', 'data' => $this->topics]],
            ['id' => "speakers", "content" => ['mimetype' => 'application/json', 'data' => $this->speakers]],
        ];
    }

    /**
     * Returns the section lists collected by parseSource().
     *
     * @return array
     */
    public function buildLists() {
        return $this->lists;
    }

    /**
     * Returns the per-turn annotation types collected by parseSource().
     *
     * @return array
     */
    public function buildAnnotationTypes() {
        return $this->annotationTypes;
    }

    /**
     * Returns the annotations collected by parseSource().
     *
     * @return array
     */
    public function buildAnnotations() {
        return $this->annotations;
    }
}