server/src/app/Libraries/Transcript/TranscriberTranscriptConverter.php
changeset 162 a6cf5a06f02d
child 460 686926d132ff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/app/Libraries/Transcript/TranscriberTranscriptConverter.php	Sat May 28 11:49:38 2016 +0200
@@ -0,0 +1,296 @@
+<?php
+
+namespace CorpusParole\Libraries\Transcript;
+
+/**
+ * Converts a Transcriber XML transcript (a <Trans> document) into the
+ * resources, lists, annotation types and annotations returned by the
+ * build*() methods.
+ *
+ * Mapping implemented below:
+ *  - /Trans/Topics/Topic     -> one entry of the "topics" resource
+ *  - /Trans/Speakers/Speaker -> one entry of the "speakers" resource
+ *  - /Trans/Episode/Section  -> one list
+ *  - each Turn in a section  -> one annotation type
+ *  - each non-empty text node of a turn -> one annotation
+ *
+ * Every time value read from the source is multiplied by 1000 (presumably
+ * seconds to milliseconds - confirm against the consumers of the output).
+ *
+ * Uses $this->source, $this->document and getMediaRefId(), none of which is
+ * declared in this file (presumably inherited from TranscriptConverterBase).
+ */
+class TranscriberTranscriptConverter extends TranscriptConverterBase {
+
+    /** @var array[] Topic entries ('id', 'desc') in document order. */
+    private $topics = [];
+    /** @var string[] Generated topic ids, keyed by the source Topic id. */
+    private $topicIds = [];
+    /** @var array[] Speaker entries ('id', 'name') in document order. */
+    private $speakers = [];
+    /** @var string[] Generated speaker ids, keyed by the source Speaker id. */
+    private $speakerIds = [];
+    /** @var array[] One list per Section. */
+    private $lists = [];
+    /** @var array[] One annotation type per Turn. */
+    private $annotationTypes = [];
+    /** @var array[] Annotations built from the text of the turns. */
+    private $annotations = [];
+    /** @var int Number used for the next turn id. */
+    private $turnCounter = 1;
+    /** @var int Number used for the next annotation id. */
+    private $annotationCounter = 1;
+
+    /**
+     * Builds a generated id: the document id followed by a formatted number.
+     *
+     * @param string $format sprintf() format holding the suffix and one %d slot
+     * @param int $number sequence number to format
+     * @return string
+     */
+    private function makeId($format, $number) {
+        return $this->document->getId().sprintf($format, $number);
+    }
+
+    /**
+     * Clears everything a previous parseSource() run collected, so that a
+     * second run starts clean instead of appending to the first one's results.
+     */
+    private function resetState() {
+        $this->topics = [];
+        $this->topicIds = [];
+        $this->speakers = [];
+        $this->speakerIds = [];
+        $this->lists = [];
+        $this->annotationTypes = [];
+        $this->annotations = [];
+        $this->turnCounter = 1;
+        $this->annotationCounter = 1;
+    }
+
+    /**
+     * Reads /Trans/Topics/Topic: records one topic entry per node and
+     * remembers which generated id stands for each source topic id.
+     */
+    private function parseTopics() {
+        $xpath = new \DOMXPath($this->source);
+        foreach($xpath->evaluate("/Trans/Topics/Topic") as $i => $topicNode) {
+            $topicId = $this->makeId("_tpc%03d", $i + 1);
+            $this->topicIds[$topicNode->getAttribute('id')] = $topicId;
+            $this->topics[] = [
+                'id' => $topicId,
+                'desc' => $topicNode->getAttribute('desc')
+            ];
+        }
+    }
+
+    /**
+     * Reads /Trans/Speakers/Speaker: records one speaker entry per node and
+     * remembers which generated id stands for each source speaker id.
+     */
+    private function parseSpeakers() {
+        $xpath = new \DOMXPath($this->source);
+        foreach($xpath->evaluate("/Trans/Speakers/Speaker") as $i => $speakerNode) {
+            $speakerId = $this->makeId("_spkr%03d", $i + 1);
+            $this->speakerIds[$speakerNode->getAttribute('id')] = $speakerId;
+            $this->speakers[] = [
+                'id' => $speakerId,
+                'name' => $speakerNode->getAttribute('name')
+            ];
+        }
+    }
+
+    /**
+     * Resolves the whitespace-separated 'speaker' attribute of a turn.
+     *
+     * Splitting on runs of whitespace and dropping empty pieces keeps the
+     * positions in line with the 1-based 'nb' attribute of <Who>, and makes
+     * a missing attribute yield an empty list.
+     *
+     * @param \DOMElement $turnNode
+     * @return array one item per speaker: ['id-ref' => generated id] when the
+     *               source id is known, otherwise the source id itself
+     */
+    private function resolveTurnSpeakers($turnNode) {
+        $resolved = [];
+        $sourceIds = preg_split('/\s+/', $turnNode->getAttribute('speaker'), -1, PREG_SPLIT_NO_EMPTY);
+        foreach($sourceIds as $sourceId) {
+            $resolved[] = array_key_exists($sourceId, $this->speakerIds)
+                ? ['id-ref' => $this->speakerIds[$sourceId]]
+                : $sourceId;
+        }
+        return $resolved;
+    }
+
+    /**
+     * Appends pending annotations to the result.
+     *
+     * @param array[] $pending annotations to append, in order
+     * @param float|null $end when not null, replaces the 'end' of each one
+     */
+    private function commitAnnotations(array $pending, $end = null) {
+        // Iterating by value: no reference is left bound after the loop.
+        foreach($pending as $annotation) {
+            if($end !== null) {
+                $annotation['end'] = $end;
+            }
+            $this->annotations[] = $annotation;
+        }
+    }
+
+    /**
+     * Creates one annotation per non-empty text node of a turn.
+     *
+     * A <Who> child selects the speaker of the text that follows it. A <Sync>
+     * child closes, at its 'time', the annotations created since the previous
+     * <Sync> and becomes the start of the following ones. Annotations still
+     * open when the turn ends keep the end of the turn.
+     *
+     * @param \DOMElement $turnNode the <Turn> element
+     * @param string $turnId generated id of the turn, used as annotation type
+     * @param float $begin start of the turn
+     * @param float $end end of the turn
+     */
+    private function buildTurnAnnotations($turnNode, $turnId, $begin, $end) {
+        $turnSpeakers = $this->resolveTurnSpeakers($turnNode);
+        // With a single speaker no <Who> is needed to know who is talking.
+        $currentSpeaker = (count($turnSpeakers) === 1) ? $turnSpeakers[0] : null;
+        $currentBegin = $begin;
+        $pending = [];
+
+        foreach($turnNode->childNodes as $cnode) {
+            if($cnode->nodeType === XML_TEXT_NODE) {
+                $textContent = trim($cnode->textContent);
+                // Not empty(): a text made of "0" is content and must be kept.
+                if($textContent === '') {
+                    continue;
+                }
+                $aData = [];
+                if($currentSpeaker !== null) {
+                    $aData['speaker'] = $currentSpeaker;
+                }
+                $aData['content'] = $textContent;
+                $pending[] = [
+                    'id' => $this->makeId("_a%04d", $this->annotationCounter++),
+                    'begin' => $currentBegin,
+                    'end' => $end,
+                    'media' => $this->getMediaRefId(),
+                    'type' => $turnId,
+                    'content' => [ "mimetype" => "application/json", "data" => $aData],
+                    'meta' => [ 'id-ref' => $turnId ]
+                ];
+            } elseif($cnode->nodeType !== XML_ELEMENT_NODE) {
+                continue;
+            } elseif($cnode->tagName === "Who") {
+                // 'nb' is 1-based; a missing or out-of-range value selects nobody.
+                $speakerIndex = intval($cnode->getAttribute('nb')) - 1;
+                $currentSpeaker = isset($turnSpeakers[$speakerIndex]) ? $turnSpeakers[$speakerIndex] : null;
+            } elseif($cnode->tagName === "Sync") {
+                $currentBegin = floatval($cnode->getAttribute('time')) * 1000;
+                $this->commitAnnotations($pending, $currentBegin);
+                $pending = [];
+            }
+        }
+        $this->commitAnnotations($pending);
+    }
+
+    /**
+     * Registers a <Turn> as an annotation type and builds its annotations.
+     *
+     * @param \DOMElement $turnNode the <Turn> element
+     * @return string generated id of the turn
+     */
+    private function parseTurn($turnNode) {
+        $turnIndex = $this->turnCounter++;
+        $turnId = $this->makeId("_trn%04d", $turnIndex);
+        $begin = floatval($turnNode->getAttribute("startTime")) * 1000;
+        $end = floatval($turnNode->getAttribute("endTime")) * 1000;
+
+        $this->annotationTypes[] = [
+            'id' => $turnId,
+            'dc:title' => "Turn $turnIndex",
+            'corpus:begin' => $begin,
+            'corpus:end' => $end
+        ];
+        $this->buildTurnAnnotations($turnNode, $turnId, $begin, $end);
+
+        return $turnId;
+    }
+
+    /**
+     * Reads /Trans/Episode/Section: parses the turns of each section and
+     * records one list per section referencing them.
+     *
+     * 'corpus:topic' is only set when the section names a topic declared in
+     * /Trans/Topics; reading an unknown key of $topicIds would otherwise
+     * raise an "undefined index" error for sections without a topic.
+     */
+    private function parseSections() {
+        $xpath = new \DOMXPath($this->source);
+        foreach($xpath->evaluate("/Trans/Episode/Section") as $sectionIndex => $sectionNode) {
+            $sectionItems = [];
+            foreach($sectionNode->childNodes as $turnNode) {
+                if($turnNode->nodeType === XML_ELEMENT_NODE && $turnNode->tagName === 'Turn') {
+                    $sectionItems[] = [ "id-ref" => $this->parseTurn($turnNode) ];
+                }
+            }
+
+            $meta = [];
+            $topic = $sectionNode->getAttribute('topic');
+            if(array_key_exists($topic, $this->topicIds)) {
+                $meta['corpus:topic'] = ["id-ref" => $this->topicIds[$topic]];
+            }
+            $meta['corpus:begin'] = floatval($sectionNode->getAttribute('startTime')) * 1000;
+            $meta['corpus:end'] = floatval($sectionNode->getAttribute('endTime')) * 1000;
+
+            $this->lists[] = [
+                'id' => $this->makeId("_sctn%03d", $sectionIndex + 1),
+                'items' => $sectionItems,
+                'meta' => $meta
+            ];
+        }
+    }
+
+    /**
+     * Parses the source document. Topics and speakers are read first because
+     * sections and turns refer to them by id.
+     */
+    public function parseSource() {
+        $this->resetState();
+        $this->parseTopics();
+        $this->parseSpeakers();
+        $this->parseSections();
+    }
+
+    /**
+     * @return array[] the "topics" and "speakers" resources, as JSON content
+     */
+    public function buildResources() {
+        return [
+            ['id' => "topics"  , "content" => ['mimetype' => 'application/json', 'data' => $this->topics]],
+            ['id' => "speakers", "content" => ['mimetype' => 'application/json', 'data' => $this->speakers]],
+        ];
+    }
+
+    /**
+     * @return array[] one list per section
+     */
+    public function buildLists() {
+        return $this->lists;
+    }
+
+    /**
+     * @return array[] one annotation type per turn
+     */
+    public function buildAnnotationTypes() {
+        return $this->annotationTypes;
+    }
+
+    /**
+     * @return array[] the annotations, in document order
+     */
+    public function buildAnnotations() {
+        return $this->annotations;
+    }
+
+}
\ No newline at end of file