|
1 <?php |
|
2 |
|
3 namespace CorpusParole\Libraries\Transcript; |
|
4 |
|
/**
 * Converts a transcript whose XML root is <Trans> (topics, speakers, episode
 * sections and turns) into the resources, lists, annotation types and
 * annotations returned by the build*() methods.
 *
 * Relies on members supplied by TranscriptConverterBase that are not visible
 * in this file: $this->source (handed to DOMXPath), $this->document (getId())
 * and getMediaRefId().
 */
class TranscriberTranscriptConverter extends TranscriptConverterBase {

    /** @var array[] Topic resources, each ['id' => ..., 'desc' => ...]. */
    private $topics = [];

    /** @var array Map from a topic's id attribute in the source to its generated id. */
    private $topicIds = [];

    /** @var array[] Speaker resources, each ['id' => ..., 'name' => ...]. */
    private $speakers = [];

    /** @var array Map from a speaker's id attribute in the source to its generated id. */
    private $speakerIds = [];

    /** @var array[] One list entry per section. */
    private $lists = [];

    /** @var array[] One annotation type per turn. */
    private $annotationTypes = [];

    /** @var array[] Annotations, in document order. */
    private $annotations = [];

    /** @var int Number used for the next generated turn id (1-based). */
    private $turnCounter = 1;

    /** @var int Number used for the next generated annotation id (1-based). */
    private $annotationCounter = 1;

    /**
     * Collects every /Trans/Topics/Topic node into $topics and records the
     * source-id => generated-id mapping in $topicIds.
     */
    private function parseTopics() {

        $xpath = new \DOMXPath($this->source);
        foreach ($xpath->evaluate("/Trans/Topics/Topic") as $i => $topicNode) {
            $topicId = $this->document->getId()."_tpc".sprintf("%03d", $i + 1);
            $this->topicIds[$topicNode->getAttribute('id')] = $topicId;
            $this->topics[] = [
                'id' => $topicId,
                'desc' => $topicNode->getAttribute('desc')
            ];
        }
    }

    /**
     * Collects every /Trans/Speakers/Speaker node into $speakers and records
     * the source-id => generated-id mapping in $speakerIds.
     */
    private function parseSpeakers() {
        $xpath = new \DOMXPath($this->source);
        foreach ($xpath->evaluate("/Trans/Speakers/Speaker") as $i => $speakerNode) {
            $speakerId = $this->document->getId()."_spkr".sprintf("%03d", $i + 1);
            $this->speakerIds[$speakerNode->getAttribute('id')] = $speakerId;
            $this->speakers[] = [
                'id' => $speakerId,
                'name' => $speakerNode->getAttribute('name')
            ];
        }
    }

    /**
     * Turns the space-separated "speaker" attribute of a turn into a list.
     *
     * @param string $speakerAttr raw attribute value
     * @return array one entry per token: ['id-ref' => generated id] when the
     *               token is a known speaker id, otherwise the raw token
     */
    private function resolveTurnSpeakers($speakerAttr) {
        $resolved = [];
        foreach (explode(" ", $speakerAttr) as $spk) {
            $resolved[] = array_key_exists($spk, $this->speakerIds)
                ? ['id-ref' => $this->speakerIds[$spk]]
                : $spk;
        }
        return $resolved;
    }

    /**
     * Appends pending annotations to $this->annotations.
     *
     * Works on copies and appends in place: no by-reference loop variable is
     * left dangling and the accumulated list is not re-copied on every flush.
     *
     * @param array[]    $pending annotations waiting to be stored
     * @param float|null $end     when not null, overrides each annotation's 'end'
     */
    private function flushAnnotations(array $pending, $end = null) {
        foreach ($pending as $annotation) {
            if ($end !== null) {
                $annotation['end'] = $end;
            }
            $this->annotations[] = $annotation;
        }
    }

    /**
     * Creates one annotation per non-blank text node of a turn.
     *
     * A <Who nb="N"> child switches the current speaker to the N-th speaker
     * of the turn. A <Sync time="T"> child closes the annotations collected
     * so far at T*1000 and makes that value the begin of the following ones.
     * Annotations still pending at the end of the turn keep the turn's end.
     *
     * @param \DOMElement $turnNode the <Turn> element
     * @param string      $turnId   generated id of the turn (used as type and meta id-ref)
     * @param float       $begin    turn start, already multiplied by 1000
     * @param float       $end      turn end, already multiplied by 1000
     */
    private function buildTurnAnnotations($turnNode, $turnId, $begin, $end) {

        $turnSpeakers = $this->resolveTurnSpeakers($turnNode->getAttribute('speaker'));
        // With a single speaker no <Who> marker is needed to know who talks.
        $currentSpeaker = (count($turnSpeakers) == 1) ? $turnSpeakers[0] : null;
        $currentBegin = $begin;
        $pending = [];

        foreach ($turnNode->childNodes as $cnode) {
            if ($cnode->nodeType === XML_TEXT_NODE) {
                $textContent = trim($cnode->textContent);
                // Strict comparison: empty() would also discard the literal text "0".
                if ($textContent === '') {
                    continue;
                }
                $aData = $currentSpeaker ? ["speaker" => $currentSpeaker] : [];
                $aData['content'] = $textContent;
                $pending[] = [
                    'id' => $this->document->getId()."_a".sprintf("%04d", $this->annotationCounter++),
                    'begin' => $currentBegin,
                    'end' => $end,
                    'media' => $this->getMediaRefId(),
                    'type' => $turnId,
                    'content' => [ "mimetype" => "application/json", "data" => $aData],
                    'meta' => [ 'id-ref' => $turnId ]
                ];

            } elseif ($cnode->nodeType === XML_ELEMENT_NODE && $cnode->tagName === "Who") {

                // A missing or out-of-range nb leaves no current speaker
                // instead of reading an undefined offset.
                $speakerIndex = intval($cnode->getAttribute('nb')) - 1;
                $currentSpeaker = isset($turnSpeakers[$speakerIndex]) ? $turnSpeakers[$speakerIndex] : null;

            } elseif ($cnode->nodeType === XML_ELEMENT_NODE && $cnode->tagName === "Sync") {

                $currentBegin = floatval($cnode->getAttribute('time')) * 1000;
                $this->flushAnnotations($pending, $currentBegin);
                $pending = [];
            }
        }
        $this->flushAnnotations($pending);

    }

    /**
     * Registers a turn as an annotation type and builds its annotations.
     *
     * @param \DOMElement $turnNode the <Turn> element
     * @return string the generated turn id
     */
    private function parseTurn($turnNode) {

        $turnIndex = $this->turnCounter++;
        $turnId = $this->document->getId()."_trn".sprintf("%04d", $turnIndex);
        $begin = floatval($turnNode->getAttribute("startTime")) * 1000;
        $end = floatval($turnNode->getAttribute("endTime")) * 1000;
        $this->annotationTypes[] = [
            'id' => $turnId,
            'dc:title' => "Turn $turnIndex",
            'corpus:begin' => $begin,
            'corpus:end' => $end
        ];

        $this->buildTurnAnnotations($turnNode, $turnId, $begin, $end);

        return $turnId;
    }

    /**
     * Builds one list per /Trans/Episode/Section node; its items reference
     * the turns found among the section's direct children.
     */
    private function parseSections() {
        $xpath = new \DOMXPath($this->source);
        foreach ($xpath->evaluate("/Trans/Episode/Section") as $sectionIndex => $sectionNode) {
            $sectionItems = [];
            foreach ($sectionNode->childNodes as $turnNode) {
                if ($turnNode->nodeType === XML_ELEMENT_NODE && $turnNode->tagName === 'Turn') {
                    $sectionItems[] = [ "id-ref" => $this->parseTurn($turnNode) ];
                }
            }

            $meta = [];
            // Only reference a topic that parseTopics() registered: a section
            // whose topic attribute is absent or unknown gets no corpus:topic
            // entry rather than an undefined-index read and a null id-ref.
            $topicRef = $sectionNode->getAttribute('topic');
            if (isset($this->topicIds[$topicRef])) {
                $meta['corpus:topic'] = ["id-ref" => $this->topicIds[$topicRef]];
            }
            $meta['corpus:begin'] = floatval($sectionNode->getAttribute('startTime')) * 1000;
            $meta['corpus:end'] = floatval($sectionNode->getAttribute('endTime')) * 1000;

            $this->lists[] = [
                'id' => $this->document->getId()."_sctn".sprintf("%03d", $sectionIndex + 1),
                'items' => $sectionItems,
                'meta' => $meta
            ];
        }
    }

    /**
     * Parses the source document. Topics and speakers are read first because
     * sections and turns look up the id maps they fill.
     */
    public function parseSource() {
        $this->parseTopics();
        $this->parseSpeakers();
        $this->parseSections();
    }

    /**
     * @return array[] the "topics" and "speakers" resources, as JSON content
     */
    public function buildResources() {
        return [
            ['id' => "topics" , "content" => ['mimetype' => 'application/json', 'data' => $this->topics]],
            ['id' => "speakers", "content" => ['mimetype' => 'application/json', 'data' => $this->speakers]],
        ];
    }

    /**
     * @return array[] the lists collected by parseSections() (one per section)
     */
    public function buildLists() {
        return $this->lists;
    }

    /**
     * @return array[] the annotation types collected by parseTurn() (one per turn)
     */
    public function buildAnnotationTypes() {
        return $this->annotationTypes;
    }

    /**
     * @return array[] the annotations collected while parsing the turns
     */
    public function buildAnnotations() {
        return $this->annotations;
    }

}