author | ymh <ymh.work@gmail.com> |
Thu, 09 Jun 2016 17:11:14 +0200 | |
changeset 182 | 1bcc373adabb |
parent 162 | a6cf5a06f02d |
child 461 | 9b7a6c099870 |
permissions | -rw-r--r-- |
162
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
1 |
<?php |
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
2 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
3 |
namespace CorpusParole\Libraries\Transcript; |
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
4 |
|
a6cf5a06f02d
add Transcript converters libraries + test
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
5 |
/**
 * Transcript converter for XML sources rooted at a TEXT element.
 *
 * Titles are read from /TEXT/HEADER/TITLE and one annotation is produced
 * for each /TEXT/S element (see buildAnnotations()).
 */
class LacitoTranscriptConverter extends TranscriptConverterBase {

    /**
     * Get the document title(s).
     *
     * Each /TEXT/HEADER/TITLE element contributes one entry keyed by its own
     * xml:lang attribute. When a title has no xml:lang, the xml:lang of the
     * root element is used, and 'und' when the root has none either.
     * Titles sharing the same language key overwrite each other (last wins).
     *
     * @return array|null map of language code => title text, or null when no title was found
     */
    public function getSourceTitle() {
        $baseNode = $this->source->documentElement;
        $defaultLanguage = $baseNode->hasAttribute('xml:lang') ? $baseNode->getAttribute('xml:lang') : 'und';
        $resTitle = [];
        $xpath = new \DOMXPath($this->source);
        foreach ($xpath->evaluate("/TEXT/HEADER/TITLE") as $headerTitle) {
            $language = $headerTitle->hasAttribute("xml:lang")
                ? $headerTitle->getAttribute('xml:lang')
                : $defaultLanguage;
            $resTitle[$language] = (string)$headerTitle->textContent;
        }
        // An empty array is falsy, so "no title" is reported as null.
        return $resTitle ?: null;
    }


    /**
     * No preliminary parsing step is performed by this converter.
     */
    public function parseSource() {
        // do nothing
    }


    /**
     * Set the 'begin' and 'end' bounds of an annotation from an AUDIO element.
     *
     * The 'start' and 'end' attributes are parsed as floats and multiplied by 1000
     * (presumably seconds to milliseconds - confirm against the source format).
     * A missing attribute reads as '' and therefore gives 0.
     *
     * @param \DOMElement $audioNode  the AUDIO element
     * @param array       $annotation annotation being built, modified in place
     */
    private function addAudioInfo($audioNode, &$annotation) {
        $annotation['begin'] = floatval($audioNode->getAttribute('start')) * 1000;
        $annotation['end'] = floatval($audioNode->getAttribute('end')) * 1000;
    }

    /**
     * Store the text of a FORM element as the annotation content.
     *
     * @param \DOMElement $formNode   the FORM element
     * @param array       $annotation annotation being built, modified in place
     */
    private function addFormInfo($formNode, &$annotation) {
        $annotation['content']['data']['content'] = $formNode->textContent;
    }

    /**
     * Store the translation carried by a TRANSL element.
     *
     * The value is whatever buildTextvalue() returns for the element text and
     * its xml:lang attribute ('' when the attribute is absent).
     *
     * @param \DOMElement $translNode the TRANSL element
     * @param array       $annotation annotation being built, modified in place
     */
    private function addTranslInfo($translNode, &$annotation) {
        $annotation['content']['data']['transl']
            = $this->buildTextvalue($translNode->textContent, $translNode->getAttribute('xml:lang'));
    }

    /**
     * Append one word entry, built from a W element, to the annotation's 'words' list.
     *
     * The entry is ['content' => <text of the first FORM>, 'transl' => <first TRANSL>].
     * Either value is "" when the corresponding element is absent.
     *
     * NOTE(review): getElementsByTagName() matches descendants at any depth, so a
     * FORM/TRANSL nested deeper inside W can be picked up when it comes first in
     * document order - confirm this is intended.
     *
     * @param \DOMElement $wNode      the W element
     * @param array       $annotation annotation being built, modified in place
     */
    private function addWordInfo($wNode, &$annotation) {
        // DOMNodeList is only Countable from PHP 7.2 on; count() on it returns 1
        // before that, so emptiness is tested through ->length, which works on
        // every version, and nodes are fetched with ->item().
        $content = "";
        $formNodes = $wNode->getElementsByTagName('FORM');
        if ($formNodes->length > 0) {
            $content = $formNodes->item(0)->textContent;
        }

        $transl = "";
        $translNodes = $wNode->getElementsByTagName('TRANSL');
        if ($translNodes->length > 0) {
            $firstTransl = $translNodes->item(0);
            $transl = $this->buildTextvalue($firstTransl->textContent, $firstTransl->getAttribute('xml:lang'));
        }

        // The [] append creates the 'words' list on first use.
        $annotation['content']['data']['words'][] = ['content' => $content, 'transl' => $transl];
    }


    /**
     * Build the annotation list, one annotation per /TEXT/S element.
     *
     * Each annotation has:
     *  - 'id': the document id followed by "_a" and the 1-based position of the
     *    S element, zero-padded to 3 digits;
     *  - 'media': the value of getMediaRefId();
     *  - 'content': ['mimetype' => 'application/json', 'data' => [...]] where data
     *    holds 'speaker' (from the 'who' attribute, when non-empty) plus whatever
     *    the direct AUDIO / FORM / TRANSL / W children contribute.
     * 'begin' and 'end' are only set when the S element has an AUDIO child.
     * Repeated FORM or TRANSL children overwrite the previous value.
     *
     * @return array list of annotations, in document order
     */
    public function buildAnnotations() {
        $xpath = new \DOMXPath($this->source);

        $annotationList = [];

        foreach ($xpath->evaluate("/TEXT/S") as $i => $s) {
            $data = [];
            $speaker = $s->getAttribute('who');
            if ($speaker) {
                $data['speaker'] = $speaker;
            }

            $annotation = [
                "id" => $this->document->getId()."_a".sprintf("%03d", $i + 1),
                "media" => $this->getMediaRefId(),
                "content" => [
                    "mimetype" => "application/json",
                    "data" => $data
                ]
            ];

            foreach ($s->childNodes as $cnode) {
                // Skip text, comment and other non-element nodes.
                if ($cnode->nodeType !== XML_ELEMENT_NODE) {
                    continue;
                }
                switch ($cnode->tagName) {
                    case "AUDIO":
                        $this->addAudioInfo($cnode, $annotation);
                        break;
                    case "FORM":
                        $this->addFormInfo($cnode, $annotation);
                        break;
                    case "TRANSL":
                        $this->addTranslInfo($cnode, $annotation);
                        break;
                    case "W":
                        $this->addWordInfo($cnode, $annotation);
                        break;
                }
            }

            $annotationList[] = $annotation;
        }

        return $annotationList;
    }

}