|
1 <?php |
|
2 |
|
3 namespace IRI\Bundle\WikiTagBundle\Utils; |
|
4 |
|
5 use IRI\Bundle\WikiTagBundle\Entity\Tag; |
|
6 |
|
/**
 * Static helpers that look a tag label up on the French Wikipedia API and
 * create or retrieve the matching Tag entity through Doctrine.
 */
class WikiTagUtils
{
    // Constants
    private static $WIKIPEDIA_API_URL = "http://fr.wikipedia.org/w/api.php";
    private static $WIKIPEDIA_VERSION_PERMALINK_TEMPLATE = "http://fr.wikipedia.org/w/index.php?oldid=%s";
    private static $DBPEDIA_URI_TEMPLATE = "http://dbpedia.org/resource/%s";

    /**
     * Gets the tag matching a label, or creates (and persists) it.
     *
     * The label is normalized, then existing tags with the same normalized
     * label are searched. The first one whose URL status is not "null_result"
     * wins; otherwise the first tag found is used. When no tag exists, a new
     * one is built, filled with the Wikipedia lookup result, persisted and
     * flushed. For an existing tag that has a Wikipedia page id, only the
     * current revision id is fetched from Wikipedia.
     *
     * @param string $tag_label the raw label as typed by the user
     * @param object $doctrine  object exposing getRepository() and getEntityManager()
     * @return array array(Tag $tag, mixed $wikipedia_revision_id (null when unknown), boolean $created)
     */
    public static function getOrCreateTag($tag_label, $doctrine)
    {
        $tag_label_normalized = self::normalizeTag($tag_label);

        // Look for existing tags carrying the same normalized label.
        $tags = $doctrine->getRepository('WikiTagBundle:Tag')->findBy(array('normalizedLabel' => $tag_label_normalized));
        $tag = null;
        foreach ($tags as $candidate) {
            if ($candidate->getUrlStatus() != Tag::$TAG_URL_STATUS_DICT['null_result']) {
                // A tag that was resolved on Wikipedia always wins.
                $tag = $candidate;
                break;
            }
            if ($tag === null) {
                // Fallback: remember the first unresolved tag.
                $tag = $candidate;
            }
        }

        $created = ($tag === null);
        if ($created) {
            $tag = new Tag();
            $tag->setLabel($tag_label_normalized);
            $tag->setOriginalLabel($tag_label);
            $tag->setNormalizedLabel($tag_label_normalized);
        }

        if ($created) {
            // We request Wikipedia only when the tag has just been created.
            $wp_response = self::getWikipediaInfo($tag_label_normalized);
            $new_label = $wp_response['new_label'];
            $status = $wp_response['status'];
            $wikipedia_revision_id = $wp_response['revision_id'];

            if ($new_label !== null && $new_label !== '') {
                $tag->setLabel($new_label);
            }
            // NOTE(review): loose comparison kept on purpose. The status values
            // live in Tag::$TAG_URL_STATUS_DICT, which is not visible here; a
            // falsy status (e.g. 0) is therefore not written, as before.
            if ($status != null) {
                $tag->setUrlStatus($status);
            }
            $tag->setWikipediaUrl($wp_response['wikipedia_url']);
            $tag->setWikipediaPageId($wp_response['pageid']);
            $tag->setDbpediaUri($wp_response['dbpedia_uri']);

            // Save the new tag.
            $em = $doctrine->getEntityManager();
            $em->persist($tag);
            $em->flush();
        } else if ($tag->getWikipediaPageId() != null) {
            // Existing tag known on Wikipedia: only the revision id is refreshed.
            $wp_response = self::getWikipediaInfo(null, $tag->getWikipediaPageId());
            $wikipedia_revision_id = $wp_response['revision_id'];
        } else {
            $wikipedia_revision_id = null;
        }

        return array($tag, $wikipedia_revision_id, $created);
    }

    /**
     * Cleans the tag label: trims it, turns underscores into spaces, collapses
     * whitespace runs into a single space and uppercases the first letter.
     *
     * The first letter is uppercased in a multibyte-aware way when the
     * mbstring extension is available, so that UTF-8 labels such as "école"
     * become "École" (ucfirst() only handles single-byte letters).
     *
     * @param string|null $tag_label
     * @return string|null the cleaned label; null and "" are returned unchanged
     */
    private static function normalizeTag($tag_label)
    {
        if ($tag_label === null || strlen($tag_label) == 0) {
            return $tag_label;
        }
        $label = trim($tag_label);
        $label = str_replace("_", " ", $label);
        $label = preg_replace('/\s+/', ' ', $label);

        if ($label !== '' && function_exists('mb_strtoupper')) {
            $first = mb_substr($label, 0, 1, 'UTF-8');
            $upper = mb_strtoupper($first, 'UTF-8');
            // Keep the original letter when uppercasing expands to several
            // characters (e.g. "ß" => "SS" on recent PHP versions).
            if (mb_strlen($upper, 'UTF-8') === 1) {
                // Explicit length: a null length is not supported by old PHP versions.
                return $upper.mb_substr($label, 1, mb_strlen($label, 'UTF-8'), 'UTF-8');
            }
            return $label;
        }
        return ucfirst($label);
    }

    /**
     * Queries Wikipedia for a page, by title or by page id, and summarizes
     * the answer.
     *
     * The title is used when given, otherwise the page id. When neither is
     * given, when the request fails, when the API does not return exactly one
     * page, or when the page is flagged "missing" or "invalid", the result of
     * returnNullResult() is returned.
     *
     * For a redirect page, a second request is made with redirects=true; the
     * revision id and language links are read from that second answer, while
     * the url, page id and label still come from the first answer.
     *
     * @param string|null $tag_label_normalized page title to look up (not URL-encoded)
     * @param mixed       $page_id              Wikipedia page id, used when no title is given
     * @return array associative array with the keys new_label, status,
     *               wikipedia_url, pageid, dbpedia_uri, revision_id and
     *               response (raw body of the first API answer)
     */
    private static function getWikipediaInfo($tag_label_normalized, $page_id=null)
    {
        $params = array('action'=>'query', 'prop'=>'info|categories|langlinks', 'inprop'=>'url', 'lllimit'=>'500', 'cllimit'=>'500', 'rvprop'=>'ids', 'format'=>'json');
        if ($tag_label_normalized != null) {
            // Raw title: requestWikipedia() takes care of the URL encoding.
            $params['titles'] = $tag_label_normalized;
        } else if ($page_id != null) {
            $params['pageids'] = $page_id;
        } else {
            return self::returnNullResult(null);
        }

        list($res, $pages) = self::requestWikipedia($params);
        $original_response = $res;
        // If there is 0 or more than 1 result, the query has failed.
        if (!is_array($pages) || count($pages) != 1) {
            return self::returnNullResult($res);
        }
        $page = reset($pages);
        // Unknown entry?
        if (!is_array($page) || array_key_exists('missing', $page) || array_key_exists('invalid', $page)) {
            return self::returnNullResult($res);
        }

        // The entry exists, we get the data.
        $url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
        $pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
        $new_label = array_key_exists('title', $page) ? $page['title'] : null;

        // We test the status (redirect first because a redirect has no categories key).
        if (array_key_exists('redirect', $page)) {
            $status = Tag::$TAG_URL_STATUS_DICT["redirection"];
        } else if (self::isHomonymy($page)) {
            $status = Tag::$TAG_URL_STATUS_DICT["homonyme"];
        } else {
            $status = Tag::$TAG_URL_STATUS_DICT["match"];
        }

        // For a redirection, more data is needed: ask again with redirects=true.
        if ($status == Tag::$TAG_URL_STATUS_DICT["redirection"]) {
            $params['redirects'] = "true";
            list($res, $pages) = self::requestWikipedia($params);
            if (!is_array($pages) || count($pages) != 1) {
                return self::returnNullResult($res);
            }
            $page = reset($pages);
            if (!is_array($page)) {
                return self::returnNullResult($res);
            }
        }

        $revision_id = isset($page['lastrevid']) ? $page['lastrevid'] : null;

        // Process the language links to extract the english label.
        $english_label = null;
        if ($status == Tag::$TAG_URL_STATUS_DICT["match"] || $status == Tag::$TAG_URL_STATUS_DICT["redirection"]) {
            if (isset($page["langlinks"]) && is_array($page["langlinks"])) {
                foreach ($page["langlinks"] as $langlink) {
                    if (isset($langlink["lang"], $langlink["*"]) && $langlink["lang"] == "en") {
                        $english_label = $langlink["*"];
                        break;
                    }
                }
            }
        }

        // We create the dbpedia uri, except for labels pointing to a page section ('#').
        $dbpedia_uri = null;
        if ($english_label !== null && $english_label !== '' && strpos($english_label, '#') === false) {
            $dbpedia_uri = self::getDbpediaUri($english_label);
        }

        return array('new_label'=>$new_label, 'status'=>$status, 'wikipedia_url'=>$url, 'pageid'=>$pageid, 'dbpedia_uri'=>$dbpedia_uri, 'revision_id'=>$revision_id, 'response'=>$original_response);
    }

    /**
     * Sends a GET request to the Wikipedia API and decodes the JSON answer.
     *
     * @param array $params query parameters (name => raw, not URL-encoded value)
     * @return array array($raw_response, $pages): the raw body returned by
     *               curl_exec() (false when the transfer failed) and the
     *               decoded "query.pages" array, which is an empty array when
     *               the request failed or the answer has no such entry
     */
    private static function requestWikipedia($params)
    {
        // Explicit '&' separator: the arg_separator.output ini setting may differ.
        $url = self::$WIKIPEDIA_API_URL.'?'.http_build_query($params, '', '&');

        $ch = curl_init();
        if ($ch === false) {
            return array(false, array());
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, 'http://www.iri.centrepompidou.fr');
        $res = curl_exec($ch);
        curl_close($ch);

        $pages = array();
        if (is_string($res)) {
            $val = json_decode($res, true);
            if (is_array($val) && isset($val["query"]["pages"]) && is_array($val["query"]["pages"])) {
                $pages = $val["query"]["pages"];
            }
        }
        return array($res, $pages);
    }

    /**
     * Builds the "nothing found" lookup result, usually used after a failed
     * request on Wikipedia: every field is null except the status, set to the
     * "null_result" status, and the raw response given by the caller.
     *
     * @param mixed $response raw API response to keep in the result (may be null)
     * @return array same keys as the array returned by getWikipediaInfo()
     */
    private static function returnNullResult($response)
    {
        return array(
            'new_label' => null,
            'status' => Tag::$TAG_URL_STATUS_DICT['null_result'],
            'wikipedia_url' => null,
            'pageid' => null,
            'dbpedia_uri' => null,
            'revision_id' => null,
            'response' => $response,
        );
    }

    /**
     * Tells whether a page is a disambiguation (homonymy) page, i.e. whether
     * one of its categories has a title containing "Catégorie:Homonymie" or
     * "Category:Disambiguation".
     *
     * @param array $page page entry of the API answer
     * @return boolean false when the page has no usable "categories" entry
     */
    private static function isHomonymy($page)
    {
        if (!isset($page["categories"]) || !is_array($page["categories"])) {
            return false;
        }
        foreach ($page["categories"] as $category) {
            if (!isset($category["title"])) {
                continue;
            }
            // Strict test because position 0 is a valid match and is falsy.
            if (strpos($category["title"], 'Catégorie:Homonymie') !== false || strpos($category["title"], 'Category:Disambiguation') !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the DBpedia URI of an english Wikipedia label.
     *
     * @param string $english_label
     * @return string
     */
    private static function getDbpediaUri($english_label)
    {
        return sprintf(self::$DBPEDIA_URI_TEMPLATE, self::urlize_for_wikipedia($english_label));
    }

    /**
     * URL-encodes a label the Wikipedia way: spaces become underscores, then
     * the result goes through urlencode().
     *
     * @param string $label
     * @return string
     */
    private static function urlize_for_wikipedia($label)
    {
        return urlencode(str_replace(" ", "_", $label));
    }
}