server/src/app/Libraries/Transcript/LacitoTranscriptConverter.php
author ymh <ymh.work@gmail.com>
Sat, 03 Dec 2016 00:09:28 +0100
changeset 461 9b7a6c099870
parent 162 a6cf5a06f02d
permissions -rw-r--r--
add morphemes to transcripts

<?php

namespace CorpusParole\Libraries\Transcript;

/**
 * Transcript converter for XML sources shaped as /TEXT/HEADER/TITLE and /TEXT/S,
 * where each S (sentence) element may carry AUDIO, FORM, TRANSL and W (word)
 * children, and each W may in turn carry FORM, TRANSL, AUDIO and M (morpheme)
 * children.
 *
 * Relies on members provided outside this class ($this->source, $this->document,
 * buildTextvalue(), getMediaRefId()); their exact contracts are not visible here.
 */
class LacitoTranscriptConverter extends TranscriptConverterBase {

    /**
     * Collect the document titles found under /TEXT/HEADER/TITLE.
     *
     * Each title is keyed by its own xml:lang attribute; when a title has none,
     * the xml:lang of the document element is used, and 'und' when that is
     * missing as well. If several titles share a language, the last one wins.
     *
     * @return array|null map of language code => title text, or null when no title exists
     */
    public function getSourceTitle() {
        $baseNode = $this->source->documentElement;
        $defaultLanguage = $baseNode->hasAttribute('xml:lang')
            ? $baseNode->getAttribute('xml:lang')
            : 'und';

        $resTitle = [];
        $xpath = new \DOMXPath($this->source);
        foreach ($xpath->evaluate("/TEXT/HEADER/TITLE") as $headerTitle) {
            $titleLanguage = $headerTitle->hasAttribute('xml:lang')
                ? $headerTitle->getAttribute('xml:lang')
                : $defaultLanguage;
            $resTitle[$titleLanguage] = (string)$headerTitle->textContent;
        }

        // An empty array is reported as null.
        return $resTitle ?: null;
    }


    /**
     * Intentionally empty: nothing is pre-parsed for this format, the source
     * document is read directly when titles and annotations are built.
     */
    public function parseSource() {
        // do nothing
    }


    /**
     * Copy the sentence-level time range of an AUDIO element into the annotation.
     *
     * The 'start' and 'end' attributes are read as floats and multiplied by 1000
     * (presumably seconds converted to milliseconds; confirm against the source format).
     *
     * @param \DOMElement $audioNode  AUDIO element of a sentence
     * @param array       $annotation annotation being built, modified in place
     */
    private function addAudioInfo($audioNode, &$annotation) {
        $annotation['begin'] = floatval($audioNode->getAttribute('start')) * 1000;
        $annotation['end'] = floatval($audioNode->getAttribute('end')) * 1000;
    }

    /**
     * Store the raw text of a sentence-level FORM element as the annotation content.
     *
     * @param \DOMElement $formNode   FORM element of a sentence
     * @param array       $annotation annotation being built, modified in place
     */
    private function addFormInfo($formNode, &$annotation) {
        $annotation['content']['data']['content'] = $formNode->textContent;
    }

    /**
     * Store the sentence-level translation, built with buildTextvalue() from the
     * TRANSL text and its xml:lang attribute.
     *
     * @param \DOMElement $translNode TRANSL element of a sentence
     * @param array       $annotation annotation being built, modified in place
     */
    private function addTranslInfo($translNode, &$annotation) {
        $annotation['content']['data']['transl'] = $this->buildTextvalue(
            $translNode->textContent,
            $translNode->getAttribute('xml:lang')
        );
    }

    /**
     * Append one word definition (from a W element) to the annotation's 'words' list.
     *
     * The definition holds the first FORM child as 'content', the first TRANSL
     * child as 'transl', and every M child as an entry of 'morphenes'. When an
     * AUDIO child provides both 'start' and 'end', they are added as integer
     * 'begin' and 'end'.
     *
     * @param \DOMElement $wNode      W element of a sentence
     * @param array       $annotation annotation being built, modified in place
     */
    private function addWordInfo($wNode, &$annotation) {
        if (!array_key_exists('words', $annotation['content']['data'])) {
            $annotation['content']['data']['words'] = [];
        }
        $wLang = $wNode->getAttribute('xml:lang');

        $content = null;
        $transl = null;
        $morphenes = [];
        $wbegin = null;
        $wend = null;

        foreach ($wNode->childNodes as $node) {
            if ($node->nodeName === "FORM" && is_null($content)) {
                // Only the first FORM is kept; it inherits the language of the word.
                $content = $this->buildTextvalue($node->textContent, $wLang);
            } elseif ($node->nodeName === "TRANSL" && is_null($transl)) {
                // Only the first TRANSL is kept; it carries its own language.
                $transl = $this->buildTextvalue($node->textContent, $node->getAttribute('xml:lang'));
            } elseif ($node->nodeName === "M") {
                $morphInfo = $this->getMorpheneInfo($node, $wLang);
                if (!is_null($morphInfo)) {
                    $morphenes[] = $morphInfo;
                }
            } elseif ($node->nodeName === 'AUDIO') {
                // Read the attributes from the current child node (the loop variable).
                $wbegin = $node->getAttribute('start');
                $wend   = $node->getAttribute('end');
            }
        }

        // The 'morphenes' key spelling is part of the produced data format; keep it as is.
        $wDef = ['content' => $content, 'transl' => $transl, 'morphenes' => $morphenes];

        // Explicit checks instead of empty(): the string "0" is a valid timecode
        // but empty("0") is true, which would silently drop the time range.
        if ($wbegin !== null && $wbegin !== '' && $wend !== null && $wend !== '') {
            // NOTE(review): sentence-level AUDIO goes through floatval()*1000 while
            // this truncates with intval(); confirm the expected unit at word level.
            $wDef['begin'] = intval($wbegin);
            $wDef['end'] = intval($wend);
        }
        $annotation['content']['data']['words'][] = $wDef;
    }

    /**
     * Build the definition of one morpheme from an M element.
     *
     * The result holds the first FORM child as 'content' and the first TRANSL
     * child as 'transl' (either may be null when absent). Integer 'begin' and
     * 'end' are added when an AUDIO child provides both 'start' and 'end';
     * 'class' and 'sclass' are added when the matching attributes are non-empty.
     *
     * @param \DOMElement $mNode M element of a word
     * @param string      $wLang language of the enclosing word, used when the M
     *                           element has no xml:lang of its own
     * @return array morpheme definition
     */
    private function getMorpheneInfo($mNode, $wLang) {
        $mLang = $mNode->getAttribute('xml:lang');
        if (empty($mLang)) {
            $mLang = $wLang;
        }
        $content = null;
        $transl = null;
        $mbegin = null;
        $mend = null;

        foreach ($mNode->childNodes as $node) {
            if ($node->nodeName === "FORM" && is_null($content)) {
                $content = $this->buildTextvalue($node->textContent, $mLang);
            } elseif ($node->nodeName === "TRANSL" && is_null($transl)) {
                $transl = $this->buildTextvalue($node->textContent, $node->getAttribute('xml:lang'));
            } elseif ($node->nodeName === 'AUDIO') {
                $mbegin = $node->getAttribute('start');
                $mend   = $node->getAttribute('end');
            }
        }
        $mDef = ['content' => $content, 'transl' => $transl];

        // Explicit checks instead of empty(): "0" is a valid timecode.
        if ($mbegin !== null && $mbegin !== '' && $mend !== null && $mend !== '') {
            $mDef['begin'] = intval($mbegin);
            $mDef['end'] = intval($mend);
        }

        $mClass = $mNode->getAttribute('class');
        if (!empty($mClass)) {
            $mDef['class'] = $mClass;
        }
        $mSclass = $mNode->getAttribute('sclass');
        if (!empty($mSclass)) {
            $mDef['sclass'] = $mSclass;
        }

        return $mDef;
    }


    /**
     * Build one annotation per /TEXT/S element of the source document.
     *
     * Each annotation gets an id made of the document id followed by "_a" and a
     * 1-based, zero-padded (3 digits) index, the media reference id, and a JSON
     * content whose data is filled from the AUDIO, FORM, TRANSL and W children
     * of the sentence. The 'who' attribute, when non-empty, is stored as 'speaker'.
     *
     * @return array list of annotations, in document order
     */
    public function buildAnnotations() {
        $xpath = new \DOMXPath($this->source);

        $annotationList = [];

        foreach ($xpath->evaluate("/TEXT/S") as $i => $s) {
            $data = [];
            $speaker = $s->getAttribute('who');
            if ($speaker) {
                $data['speaker'] = $speaker;
            }

            $annotation = [
                "id" => $this->document->getId()."_a".sprintf("%03d", $i + 1),
                "media" => $this->getMediaRefId(),
                "content" => [
                    "mimetype" => "application/json",
                    "data" => $data
                ]
            ];

            foreach ($s->childNodes as $cnode) {
                // Skip text, comment and other non-element nodes.
                if ($cnode->nodeType !== XML_ELEMENT_NODE) {
                    continue;
                }
                switch ($cnode->tagName) {
                case "AUDIO":
                    $this->addAudioInfo($cnode, $annotation);
                    break;
                case "FORM":
                    $this->addFormInfo($cnode, $annotation);
                    break;
                case "TRANSL":
                    $this->addTranslInfo($cnode, $annotation);
                    break;
                case "W":
                    $this->addWordInfo($cnode, $annotation);
                    break;
                }
            }

            $annotationList[] = $annotation;
        }

        return $annotationList;
    }


}