author | ymh <ymh.work@gmail.com> |
Sat, 03 Dec 2016 00:09:28 +0100 | |
changeset 461 | 9b7a6c099870 |
parent 162 | a6cf5a06f02d |
permissions | -rw-r--r-- |
162
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
1 |
<?php |
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
2 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
3 |
namespace CorpusParole\Libraries\Transcript; |
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
4 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
5 |
class LacitoTranscriptConverter extends TranscriptConverterBase { |
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
6 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
7 |
// get document title |
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
8 |
/**
 * Collect the document titles found under /TEXT/HEADER/TITLE.
 *
 * Each title is keyed by its own xml:lang attribute. A title without that
 * attribute falls back to the xml:lang of the document root element, and to
 * 'und' when the root has none either. When several titles resolve to the
 * same language key, the last one in document order wins.
 *
 * @return array|null map of language code => title text, or null when no
 *                    TITLE element was found
 */
public function getSourceTitle() {
    $root = $this->source->documentElement;

    // Language used for titles that do not declare their own.
    $fallbackLang = 'und';
    if ($root->hasAttribute('xml:lang')) {
        $fallbackLang = $root->getAttribute('xml:lang');
    }

    $titles = [];
    $finder = new \DOMXPath($this->source);
    foreach ($finder->evaluate("/TEXT/HEADER/TITLE") as $titleNode) {
        $lang = $fallbackLang;
        if ($titleNode->hasAttribute("xml:lang")) {
            $lang = $titleNode->getAttribute('xml:lang');
        }
        $titles[$lang] = (string)$titleNode->textContent;
    }

    if (!$titles) {
        return null;
    }
    return $titles;
}
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
18 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
19 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
20 |
// do nothing |
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
21 |
/**
 * Source parsing step: this converter performs no work here.
 *
 * NOTE(review): presumably implements a hook expected by
 * TranscriptConverterBase -- confirm against the base class.
 *
 * @return void
 */
public function parseSource() {
    // Intentionally empty.
    return;
}
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
24 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
25 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
26 |
/**
 * Copy the timecodes of an AUDIO element into the annotation.
 *
 * The 'start' and 'end' attributes are parsed as floats and multiplied by
 * 1000 before being stored under 'begin' and 'end'.
 * NOTE(review): presumably a seconds -> milliseconds conversion -- confirm.
 *
 * @param \DOMElement $audioNode  element carrying 'start' and 'end' attributes
 * @param array       $annotation annotation being built, modified in place
 */
private function addAudioInfo($audioNode, &$annotation) {
    // annotation key => source attribute name
    $mapping = ['begin' => 'start', 'end' => 'end'];

    foreach ($mapping as $key => $attribute) {
        $annotation[$key] = floatval($audioNode->getAttribute($attribute)) * 1000;
    }
}
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
30 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
31 |
/**
 * Store the text of a FORM element as the annotation content.
 *
 * @param \DOMNode $formNode   node whose textContent is used
 * @param array    $annotation annotation being built, modified in place
 */
private function addFormInfo($formNode, &$annotation) {
    $text = $formNode->textContent;
    $annotation['content']['data']['content'] = $text;
}
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
34 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
35 |
/**
 * Store the translation carried by a TRANSL element in the annotation.
 *
 * The node text and its xml:lang attribute are handed to buildTextvalue()
 * (defined outside this block) and the result is stored under
 * ['content']['data']['transl'].
 *
 * @param \DOMElement $translNode the TRANSL element
 * @param array       $annotation annotation being built, modified in place
 */
private function addTranslInfo($translNode, &$annotation) {
    $text = $translNode->textContent;
    $lang = $translNode->getAttribute('xml:lang');

    $annotation['content']['data']['transl'] = $this->buildTextvalue($text, $lang);
}
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
39 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
40 |
/**
 * Append the description of one W element to the annotation word list.
 *
 * Reads, among the direct children of $wNode:
 *  - the first FORM    -> 'content' (built with the W element's xml:lang),
 *  - the first TRANSL  -> 'transl'  (built with the TRANSL's own xml:lang),
 *  - every M           -> one entry of 'morphenes' via getMorpheneInfo(),
 *  - AUDIO             -> 'begin' / 'end' (last AUDIO child wins).
 * The resulting array is pushed onto $annotation['content']['data']['words'],
 * which is created when missing.
 *
 * @param \DOMElement $wNode      the W element to describe
 * @param array       $annotation annotation being built, modified in place;
 *                                $annotation['content']['data'] must already
 *                                be an array
 */
private function addWordInfo($wNode, &$annotation) {
    if (!array_key_exists('words', $annotation['content']['data'])) {
        $annotation['content']['data']['words'] = [];
    }
    $wLang = $wNode->getAttribute('xml:lang');

    $content = null;
    $transl = null;
    $morphenes = [];
    $wbegin = null;
    $wend = null;

    foreach ($wNode->childNodes as $node) {
        if ($node->nodeName === "FORM" && is_null($content)) {
            $content = $this->buildTextvalue($node->textContent, $wLang);
        } elseif ($node->nodeName === "TRANSL" && is_null($transl)) {
            $transl = $this->buildTextvalue($node->textContent, $node->getAttribute('xml:lang'));
        } elseif ($node->nodeName === "M") {
            $morphInfo = $this->getMorpheneInfo($node, $wLang);
            if (!is_null($morphInfo)) {
                $morphenes[] = $morphInfo;
            }
        } elseif ($node->nodeName === 'AUDIO') {
            // Fix: the attributes must be read from the current child node.
            // The previous code dereferenced an undefined $audio variable,
            // which is a fatal error as soon as a W element has an AUDIO child.
            $wbegin = $node->getAttribute('start');
            $wend = $node->getAttribute('end');
        }
    }

    $wDef = ['content' => $content, 'transl' => $transl, 'morphenes' => $morphenes];

    // getAttribute() yields '' for a missing attribute. Compare explicitly
    // instead of using empty(), which would also reject a legitimate "0".
    $hasBegin = !is_null($wbegin) && $wbegin !== '';
    $hasEnd = !is_null($wend) && $wend !== '';
    if ($hasBegin && $hasEnd) {
        // NOTE(review): sentence-level timecodes use floatval()*1000 whereas
        // these are truncated with intval() and not scaled -- confirm units.
        $wDef['begin'] = intval($wbegin);
        $wDef['end'] = intval($wend);
    }

    $annotation['content']['data']['words'][] = $wDef;
}
|
76 |
||
77 |
/**
 * Build the description of one M element.
 *
 * Reads, among the direct children of $mNode, the first FORM ('content'),
 * the first TRANSL ('transl') and the AUDIO timecodes (last AUDIO child
 * wins). The 'class' and 'sclass' attributes of $mNode are copied when they
 * are non-empty.
 *
 * @param \DOMElement $mNode the M element to describe
 * @param string      $wLang language of the enclosing word, used when the
 *                           M element has no xml:lang of its own
 *
 * @return array always an array with keys 'content' and 'transl' (either may
 *               be null), plus optional 'begin', 'end', 'class', 'sclass'
 */
private function getMorpheneInfo($mNode, $wLang) {
    $mLang = $mNode->getAttribute('xml:lang');
    if (empty($mLang)) {
        $mLang = $wLang;
    }

    $content = null;
    $transl = null;
    $mbegin = null;
    $mend = null;

    foreach ($mNode->childNodes as $node) {
        if ($node->nodeName === "FORM" && is_null($content)) {
            $content = $this->buildTextvalue($node->textContent, $mLang);
        } elseif ($node->nodeName === "TRANSL" && is_null($transl)) {
            $transl = $this->buildTextvalue($node->textContent, $node->getAttribute('xml:lang'));
        } elseif ($node->nodeName === 'AUDIO') {
            $mbegin = $node->getAttribute('start');
            $mend = $node->getAttribute('end');
        }
    }

    $mDef = ['content' => $content, 'transl' => $transl];

    // getAttribute() yields '' for a missing attribute. Compare explicitly
    // instead of using empty(): empty("0") is true, so a morpheme starting
    // at "0" used to lose its timecodes.
    $hasBegin = !is_null($mbegin) && $mbegin !== '';
    $hasEnd = !is_null($mend) && $mend !== '';
    if ($hasBegin && $hasEnd) {
        // NOTE(review): sentence-level timecodes use floatval()*1000 whereas
        // these are truncated with intval() and not scaled -- confirm units.
        $mDef['begin'] = intval($mbegin);
        $mDef['end'] = intval($mend);
    }

    $mClass = $mNode->getAttribute('class');
    if (!empty($mClass)) {
        $mDef['class'] = $mClass;
    }
    $mSclass = $mNode->getAttribute('sclass');
    if (!empty($mSclass)) {
        $mDef['sclass'] = $mSclass;
    }

    return $mDef;
}
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
115 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
116 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
117 |
/**
 * Build one annotation per /TEXT/S element of the source document.
 *
 * Each annotation gets:
 *  - 'id':      the document id followed by "_a" and the 1-based, zero-padded
 *               (3 digits) position of the S element,
 *  - 'media':   the value returned by getMediaRefId(),
 *  - 'content': mimetype "application/json" plus a 'data' array, which holds
 *               'speaker' when the S element has a non-empty 'who' attribute.
 * The AUDIO, FORM, TRANSL and W element children of S are then delegated to
 * the matching add*Info() method; other children are ignored.
 *
 * @return array list of annotation arrays, in document order
 */
public function buildAnnotations() {
    $annotations = [];
    $finder = new \DOMXPath($this->source);

    foreach ($finder->evaluate("/TEXT/S") as $index => $sentence) {
        $who = $sentence->getAttribute('who');
        $payload = $who ? ['speaker' => $who] : [];

        $rank = sprintf("%03d", $index + 1);
        $entry = [
            "id" => $this->document->getId()."_a".$rank,
            "media" => $this->getMediaRefId(),
            "content" => [
                "mimetype" => "application/json",
                "data" => $payload
            ]
        ];

        foreach ($sentence->childNodes as $child) {
            // Only element children carry information (skip text, comments...).
            if ($child->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }

            $tag = $child->tagName;
            if ($tag === "AUDIO") {
                $this->addAudioInfo($child, $entry);
            } elseif ($tag === "FORM") {
                $this->addFormInfo($child, $entry);
            } elseif ($tag === "TRANSL") {
                $this->addTranslInfo($child, $entry);
            } elseif ($tag === "W") {
                $this->addWordInfo($child, $entry);
            }
        }

        $annotations[] = $entry;
    }

    return $annotations;
}
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
162 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
163 |
|
461 | 164 |
} |