server/src/app/Libraries/Transcript/TranscriberTranscriptConverter.php
author ymh <ymh.work@gmail.com>
Mon, 06 Feb 2017 14:36:25 +0100
changeset 499 b5cff30efa0a
parent 476 9cffc7f32f14
permissions -rw-r--r--
add forgotten 'created' field in documents results

<?php

namespace CorpusParole\Libraries\Transcript;

/**
 * Converter for XML transcripts whose root element is "Trans"
 * (topics, speakers, and an episode made of sections and turns).
 *
 * parseSource() walks the source DOM document once and fills the private
 * collections below; the build*() methods then simply expose them.
 *
 * Every generated identifier is prefixed with the id of the document being
 * converted ($this->document->getId()).
 */
class TranscriberTranscriptConverter extends TranscriptConverterBase {

    // Topic descriptors ('id', 'desc') in source order.
    private $topics = [];
    // Map: topic id as found in the source => generated topic id.
    private $topicIds = [];
    // Speaker descriptors ('id', 'name') in source order.
    private $speakers = [];
    // Map: speaker id as found in the source => generated speaker id.
    private $speakerIds = [];
    // One entry per Section element, referencing the turns it contains.
    private $lists = [];
    // One entry per Turn element.
    private $annotationTypes = [];
    // Annotations produced from the content of the turns.
    private $annotations = [];
    // 1-based counters used to build turn and annotation ids.
    private $turnCounter = 1;
    private $annotationCounter = 1;

    /**
     * Collect every /Trans/Topics/Topic element and assign it a generated id
     * of the form "<document id>_tpcNNN" (1-based, zero padded).
     */
    private function parseTopics() {

        $xpath = new \DOMXPath($this->source);
        foreach($xpath->evaluate("/Trans/Topics/Topic") as $i=>$topicNode) {
            $topicId = $this->document->getId()."_tpc".sprintf("%03d",$i+1);
            $this->topicIds[$topicNode->getAttribute('id')] = $topicId;
            $this->topics[] = [
                'id' => $topicId,
                'desc' => $topicNode->getAttribute('desc')
            ];
        }
    }

    /**
     * Collect every /Trans/Speakers/Speaker element and assign it a generated
     * id of the form "<document id>_spkrNNN" (1-based, zero padded).
     */
    private function parseSpeakers() {
        $xpath = new \DOMXPath($this->source);
        foreach($xpath->evaluate("/Trans/Speakers/Speaker") as $i=>$speakerNode) {
            $speakerId = $this->document->getId()."_spkr".sprintf("%03d",$i+1);
            $this->speakerIds[$speakerNode->getAttribute('id')] = $speakerId;
            $this->speakers[] = [
                'id' => $speakerId,
                'name' => $speakerNode->getAttribute('name')
            ];
        }
    }

    /**
     * Build one annotation array attached to the given turn.
     *
     * Consumes one value of the annotation counter.
     *
     * @param string $turnId id of the turn (used as annotation type and meta reference)
     * @param float $begin begin time of the annotation
     * @param float $end end time of the annotation
     * @param array|string|null $speaker current speaker (id reference array or raw id), if any
     * @param array|string $content text of the annotation, or descriptor of a non-text element
     * @return array the annotation
     */
    private function createTurnAnnotation($turnId, $begin, $end, $speaker, $content) {
        $aData = [];
        // Explicit test instead of a truthiness test: an undeclared speaker
        // whose raw id is "0" must not be discarded.
        if(!is_null($speaker) && $speaker !== '') {
            $aData['speaker'] = $speaker;
        }
        $aData['content'] = $content;

        return [
            'id' => $this->document->getId()."_a".sprintf("%04d", $this->annotationCounter++),
            'begin' => $begin,
            'end' => $end,
            'media' => $this->getMediaRefId(),
            'type' => $turnId,
            'content' => [ "mimetype" => "application/json", "data" => $aData],
            'meta' => [ 'id-ref' => $turnId ]
        ];
    }

    /**
     * Describe a non-text child element of a turn.
     *
     * @param \DOMElement $node the element to describe
     * @return array|null the descriptor for Background, Event, Comment and
     *                    Vocal elements, null for any other element
     */
    private function describeTurnElement($node) {
        $ctype = strtolower($node->tagName);
        switch ($node->tagName) {
            case 'Background':
                return [
                    'ctype' => $ctype,
                    'type' => $node->getAttribute('type'),
                    'level' => $node->getAttribute('level')
                ];
            case 'Event':
                return [
                    'ctype' => $ctype,
                    'type' => $node->getAttribute('type'),
                    'extent' => $node->getAttribute('extent'),
                    'desc' => $node->getAttribute('desc')
                ];
            case 'Comment':
            case 'Vocal':
                return [
                    'ctype' => $ctype,
                    'desc' => $node->getAttribute('desc')
                ];
            default:
                return null;
        }
    }

    /**
     * Turn the children of a Turn element into annotations.
     *
     * - text nodes and Background/Event/Comment/Vocal elements each produce
     *   one annotation starting at the current begin time;
     * - a Who element switches the current speaker (its "nb" attribute is a
     *   1-based index into the speakers listed on the turn);
     * - a Sync element closes the pending annotations at its time and makes
     *   that time the new begin time.
     *
     * Annotations still pending when the turn ends keep the turn end time.
     *
     * @param \DOMElement $turnNode the Turn element
     * @param string $turnId generated id of the turn
     * @param float $begin turn begin time
     * @param float $end turn end time
     */
    private function buildTurnAnnotations($turnNode, $turnId, $begin, $end) {

        $currentAnnotations = [];
        $currentBegin = $begin;
        $currentSpeaker = null;

        // Declared speakers become id references, unknown ones are kept as
        // the raw string found in the source.
        $turnSpeakers = [];
        foreach(explode(" ", $turnNode->getAttribute('speaker')) as $spk) {
            $turnSpeakers[] = array_key_exists($spk, $this->speakerIds)
                ? ['id-ref' => $this->speakerIds[$spk]]
                : $spk;
        }
        if(count($turnSpeakers) == 1) {
            $currentSpeaker = $turnSpeakers[0];
        }

        foreach($turnNode->childNodes as $cnode) {
            if($cnode->nodeType === XML_TEXT_NODE) {
                $textContent = trim($cnode->textContent);
                // Strict comparison: empty() would also drop the text "0".
                if($textContent === '') {
                    continue;
                }
                $currentAnnotations[] = $this->createTurnAnnotation(
                    $turnId, $currentBegin, $end, $currentSpeaker, $textContent
                );
                continue;
            }

            if($cnode->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }

            if($cnode->tagName === "Who") {

                // A missing or out of range "nb" leaves the speaker unknown
                // instead of triggering an undefined offset error.
                $speakerIndex = intval($cnode->getAttribute('nb')) - 1;
                $currentSpeaker = isset($turnSpeakers[$speakerIndex]) ? $turnSpeakers[$speakerIndex] : null;

            } elseif($cnode->tagName === "Sync") {

                // NOTE(review): the factor 1000 presumably converts seconds to milliseconds - confirm.
                $currentBegin = floatval($cnode->getAttribute('time')) * 1000;
                // Updated by index so that no reference outlives the loop.
                foreach(array_keys($currentAnnotations) as $annotIndex) {
                    $currentAnnotations[$annotIndex]['end'] = $currentBegin;
                }
                $this->annotations = array_merge($this->annotations, $currentAnnotations);
                $currentAnnotations = [];

            } else {

                $content = $this->describeTurnElement($cnode);
                if(!is_null($content)) {
                    $currentAnnotations[] = $this->createTurnAnnotation(
                        $turnId, $currentBegin, $end, $currentSpeaker, $content
                    );
                }
            }
        }
        $this->annotations = array_merge($this->annotations, $currentAnnotations);

    }

    /**
     * Register a Turn element as an annotation type and build its annotations.
     *
     * @param \DOMElement $turnNode the Turn element
     * @return string the generated turn id ("<document id>_trnNNNN")
     */
    private function parseTurn($turnNode) {

        $turnIndex = $this->turnCounter++;
        $turnId = $this->document->getId()."_trn".sprintf("%04d", $turnIndex);
        $begin = floatval($turnNode->getAttribute("startTime")) * 1000;
        $end =  floatval($turnNode->getAttribute("endTime")) * 1000;
        $this->annotationTypes[] = [
            'id' => $turnId,
            'dc:title' => "Turn $turnIndex",
            'corpus:begin' => $begin,
            'corpus:end' => $end
        ];

        $this->buildTurnAnnotations($turnNode, $turnId, $begin, $end);

        return $turnId;
    }

    /**
     * Build one list per /Trans/Episode/Section element. Each list references
     * the turns found directly under the section and, when the section names
     * a known topic, that topic.
     */
    private function parseSections() {
        $xpath = new \DOMXPath($this->source);
        foreach($xpath->evaluate("/Trans/Episode/Section") as $sectionIndex=>$sectionNode) {
            $sectionItems = [];
            foreach($sectionNode->childNodes as $turnNode) {
                if($turnNode->nodeType === XML_ELEMENT_NODE && $turnNode->tagName === 'Turn') {
                    $sectionItems[] = [ "id-ref" => $this->parseTurn($turnNode)];
                }
            }
            $section = [
                'id' => $this->document->getId()."_sctn".sprintf("%03d", $sectionIndex+1),
                'items' => $sectionItems,
                'meta' => [
                    'corpus:begin' => floatval($sectionNode->getAttribute('startTime'))*1000,
                    'corpus:end' => floatval($sectionNode->getAttribute('endTime'))*1000,
                ]
            ];
            $topicRef = $sectionNode->getAttribute('topic');
            if(!empty($topicRef) && array_key_exists($topicRef, $this->topicIds)) {
                $section['meta']['corpus:topic'] = ["id-ref" => $this->topicIds[$topicRef]];
            }
            $this->lists[] = $section;
        }
    }

    /**
     * Parse the source document. Topics and speakers are parsed first because
     * sections and turns refer to the ids generated for them.
     */
    public function parseSource() {
        $this->parseTopics();
        $this->parseSpeakers();
        $this->parseSections();
    }

    /**
     * @return array the "topics" and "speakers" resources
     */
    public function buildResources() {
        return [
            ['id' => "topics"  , "content" => ['mimetype' => 'application/json', 'data' => $this->topics]],
            ['id' => "speakers", "content" => ['mimetype' => 'application/json', 'data' => $this->speakers]],
        ];
    }

    /**
     * @return array the lists (one per section)
     */
    public function buildLists() {
        return $this->lists;
    }

    /**
     * @return array the annotation types (one per turn)
     */
    public function buildAnnotationTypes() {
        return $this->annotationTypes;
    }

    /**
     * @return array the annotations built from the turns content
     */
    public function buildAnnotations() {
        return $this->annotations;
    }


}