| author | cavaliet |
| Wed, 30 Nov 2011 17:58:18 +0100 | |
| changeset 50 | e967654e90cb |
| parent 43 | 54f204bceb28 |
| child 60 | 984ba20c150b |
| permissions | -rwxr-xr-x |
| 2 | 1 |
<?php |
2 |
||
3 |
namespace IRI\Bundle\WikiTagBundle\Utils; |
|
4 |
||
5 |
use IRI\Bundle\WikiTagBundle\Entity\Tag; |
|
6 |
||
7 |
class WikiTagUtils |
|
8 |
{ |
|
9 |
// Constants |
|
10 |
private static $WIKIPEDIA_API_URL = "http://fr.wikipedia.org/w/api.php"; |
|
11 |
private static $WIKIPEDIA_VERSION_PERMALINK_TEMPLATE = "http://fr.wikipedia.org/w/index.php?oldid=%s"; |
|
12 |
private static $DBPEDIA_URI_TEMPLATE = "http://dbpedia.org/resource/%s"; |
|
13 |
||
14 |
||
15 |
/**
 * Cleans a tag label.
 *
 * Underscores are turned into spaces, every run of whitespace is collapsed
 * into a single space, leading and trailing whitespace is removed and the
 * first character is upper-cased.
 *
 * @param string $tag_label the raw label
 * @return string the cleaned label; an empty (or null) label is returned unchanged
 */
public static function normalizeTag($tag_label)
{
    if ($tag_label === null || strlen($tag_label) == 0) {
        return $tag_label;
    }

    // Underscores are replaced first, so that the spaces they produce are
    // collapsed and trimmed like any other whitespace.
    $label = str_replace("_", " ", $tag_label);

    $collapsed = preg_replace('/\s+/u', ' ', $label);
    if ($collapsed === null) {
        // preg_replace() returns null when the subject is not valid UTF-8:
        // fall back to a byte-wise replacement instead of losing the label.
        $collapsed = preg_replace('/\s+/', ' ', $label);
    }
    $label = trim($collapsed);

    if ($label === '') {
        return $label;
    }

    // ucfirst() works on bytes and leaves a multi-byte first letter
    // (e.g. "é") untouched, so the first UTF-8 character is upper-cased
    // with mbstring when it is available.
    if (function_exists('mb_strtoupper') && preg_match('/^./us', $label, $first_char)) {
        $first = $first_char[0];
        return mb_strtoupper($first, 'UTF-8').substr($label, strlen($first));
    }

    return ucfirst($label);
}
|
29 |
||
30 |
/**
 * Query wikipedia with a normalized label or a pageid.
 *
 * The label is used when it is not empty, otherwise the page id is used.
 * When both are empty no request is made and a null result is returned.
 *
 * Returns an array with the form
 * array(
 *     'new_label'=>$new_label,
 *     'alternative_label'=>$alternative_label,
 *     'status'=>$status,
 *     'wikipedia_url'=>$url,
 *     'wikipedia_alternative_url'=>$alternative_url,
 *     'pageid'=>$pageid,
 *     'alternative_pageid'=>$alternative_pageid,
 *     'dbpedia_uri'=>$dbpedia_uri,
 *     'revision_id'=>$revision_id,
 *     'response'=> the original wikipedia json response
 * )
 *
 * 'status' is one of the values of Tag::$TAG_URL_STATUS_DICT ("match",
 * "redirection", "homonyme" or "null_result"). For a redirection, a second
 * request follows the redirect: the 'alternative_*' entries describe the
 * target page, and 'revision_id' and 'dbpedia_uri' are computed from the
 * target page as well. 'response' is always the body of the first request.
 *
 * @param string $tag_label_normalized
 * @param int|null $page_id
 * @return array
 * @throws \Exception when a request to Wikipedia fails
 */
public static function getWikipediaInfo($tag_label_normalized, $page_id=null)
{
    $params = array(
        'action'=>'query',
        'prop'=>'info|categories|langlinks',
        'inprop'=>'url',
        'lllimit'=>'500',
        'cllimit'=>'500',
        'rvprop'=>'ids',
        'format'=>'json',
    );

    if ($tag_label_normalized != null) {
        // requestWikipedia() concatenates the values as they are, so the title is encoded here.
        $params['titles'] = urlencode($tag_label_normalized);
    }
    else if ($page_id != null) {
        $params['pageids'] = $page_id;
    }
    else {
        return self::returnNullResult(null);
    }

    // An exception raised by the request is left to propagate as it is:
    // catching it only to throw a copy would discard its stack trace.
    list($res, $pages) = self::requestWikipedia($params);
    $original_response = $res;

    // If there is 0 or more than 1 result, the query has failed
    if (!is_array($pages) || count($pages) != 1) {
        return self::returnNullResult($res);
    }
    // get first result
    $page = reset($pages);
    // Unknown entry ?
    if (!is_array($page) || array_key_exists('missing', $page) || array_key_exists('invalid', $page)) {
        return self::returnNullResult($res);
    }

    // The entry exists, we get the data.
    $url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
    $pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
    $new_label = array_key_exists('title', $page) ? $page['title'] : null;

    // We test the status (redirect first because a redirect has no categories key)
    if (array_key_exists('redirect', $page)) {
        $status = Tag::$TAG_URL_STATUS_DICT["redirection"];
    }
    else if (array_key_exists('categories', $page) && self::isHomonymy($page)) {
        $status = Tag::$TAG_URL_STATUS_DICT["homonyme"];
    }
    else {
        $status = Tag::$TAG_URL_STATUS_DICT["match"];
    }

    // In redirection, we have to get more data by adding redirects=true to the params
    $alternative_label = null;
    $alternative_url = null;
    $alternative_pageid = null;
    if ($status == Tag::$TAG_URL_STATUS_DICT["redirection"]) {
        $params['redirects'] = "true";
        list($res, $pages) = self::requestWikipedia($params);
        if (!is_array($pages) || count($pages) != 1) {
            return self::returnNullResult($res);
        }
        // From here on $page is the target of the redirect
        $page = reset($pages);
        if (!is_array($page)) {
            return self::returnNullResult($res);
        }
        $alternative_label = array_key_exists('title', $page) ? $page['title'] : null;
        $alternative_url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
        $alternative_pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
    }

    $revision_id = array_key_exists('lastrevid', $page) ? $page['lastrevid'] : null;

    // process language links to extract the english label
    $english_label = null;
    $has_language_links = array_key_exists("langlinks", $page) && is_array($page["langlinks"]);
    if ($has_language_links
        && ($status == Tag::$TAG_URL_STATUS_DICT["match"] || $status == Tag::$TAG_URL_STATUS_DICT["redirection"])) {
        foreach ($page["langlinks"] as $language_link) {
            if (isset($language_link["lang"], $language_link["*"]) && $language_link["lang"] == "en") {
                $english_label = $language_link["*"];
                break;
            }
        }
    }

    // We create the dbpedia uri, unless the english label points to a section of a page.
    $dbpedia_uri = null;
    if ($english_label != null && strpos($english_label, '#') === false) {
        $dbpedia_uri = self::getDbpediaUri($english_label);
    }

    return array(
        'new_label'=>$new_label,
        'alternative_label'=>$alternative_label,
        'status'=>$status,
        'wikipedia_url'=>$url,
        'wikipedia_alternative_url'=>$alternative_url,
        'pageid'=>$pageid,
        'alternative_pageid'=>$alternative_pageid,
        'dbpedia_uri'=>$dbpedia_uri,
        'revision_id'=>$revision_id,
        'response'=>$original_response,
    );
}
|
154 |
||
155 |
||
156 |
/**
 * Build and do the request to Wikipedia.
 *
 * The parameters are appended to the API url as "key=value" pairs joined
 * with "&". Values are concatenated as they are: the caller must have
 * URL-encoded any value that needs it.
 *
 * @param array $params query string parameters (name => value)
 * @return array array($raw_response, $pages) where $raw_response is the
 *               response body and $pages is the "query" > "pages" entry of
 *               the decoded response (an empty array when the response has
 *               no such entry)
 * @throws \Exception when the transfer fails or the response is not valid JSON
 */
private static function requestWikipedia($params)
{
    $pairs = array();
    foreach ($params as $key => $value) {
        $pairs[] = $key.'='.$value;
    }
    $url = self::$WIKIPEDIA_API_URL.'?'.implode('&', $pairs);

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, 'http://www.iri.centrepompidou.fr');
    curl_setopt($ch, CURLOPT_TIMEOUT_MS, 5000);
    $res = curl_exec($ch);
    // Everything needed from the handle is read before it is closed
    $curl_errno = curl_errno($ch);
    $curl_error = curl_error($ch);
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // With CURLOPT_RETURNTRANSFER, curl_exec() returns false on failure
    if ($curl_errno > 0 || $res === false) {
        throw new \Exception("Wikipedia request failed. cURLError #$curl_errno: $curl_error\n");
    }

    $val = json_decode($res, true);
    if (!is_array($val)) {
        // e.g. an HTML error page: there is nothing the caller could read from it
        throw new \Exception("Wikipedia request failed. HTTP status $http_code: the response is not valid JSON\n");
    }

    // A valid response without pages is handed back as "no page",
    // so that the caller can always count the pages.
    $pages = array();
    if (isset($val["query"]["pages"]) && is_array($val["query"]["pages"])) {
        $pages = $val["query"]["pages"];
    }

    return array($res, $pages);
}
|
194 |
||
195 |
/**
 * Builds the result returned when no Wikipedia page could be associated
 * to a tag, usually after a failed request on Wikipedia.
 *
 * The array has the same keys as a successful result: every entry is null
 * except 'status' (Tag::$TAG_URL_STATUS_DICT['null_result']) and 'response'.
 *
 * @param mixed $response the raw Wikipedia response, or null when no request was made
 * @return array
 */
private static function returnNullResult($response)
{
    return array(
        'new_label'=>null,
        // the 'alternative' entries are part of a successful result, so they are provided here too
        'alternative_label'=>null,
        'status'=>Tag::$TAG_URL_STATUS_DICT['null_result'],
        'wikipedia_url'=>null,
        'wikipedia_alternative_url'=>null,
        'pageid'=>null,
        'alternative_pageid'=>null,
        'dbpedia_uri'=>null,
        'revision_id'=>null,
        'response'=>$response,
    );
}
|
202 |
||
203 |
/**
 * Tells whether a Wikipedia page is a homonymy (disambiguation) page.
 *
 * A page is a homonymy page when the title of one of its categories
 * contains 'Catégorie:Homonymie' or 'Category:Disambiguation'.
 *
 * @param array $page a page entry of the Wikipedia response
 * @return boolean false when the page has no categories
 */
private static function isHomonymy($page)
{
    // A page can come without any 'categories' entry (a redirect for instance)
    if (!is_array($page) || !isset($page["categories"]) || !is_array($page["categories"])) {
        return false;
    }

    foreach ($page["categories"] as $category) {
        if (!isset($category["title"])) {
            continue;
        }
        $title = $category["title"];
        // Strict test because a match at position 0 would be seen as false.
        if (strpos($title, 'Catégorie:Homonymie') !== false || strpos($title, 'Category:Disambiguation') !== false) {
            return true;
        }
    }

    return false;
}
|
219 |
||
220 |
/**
 * Builds the DbPedia URI of a resource from its english label.
 *
 * The label is first encoded with urlize_for_wikipedia(), then inserted
 * into the DbPedia URI template.
 *
 * @param string $english_label
 * @return string
 */
private static function getDbpediaUri($english_label)
{
    $resource_name = self::urlize_for_wikipedia($english_label);

    return sprintf(self::$DBPEDIA_URI_TEMPLATE, $resource_name);
}
|
227 |
||
228 |
/**
 * Encodes a label for wikipedia: spaces are replaced
 * with underscores, then the result is URL-encoded.
 *
 * @param string $label
 * @return string
 */
private static function urlize_for_wikipedia($label)
{
    $underscored = str_replace(" ", "_", $label);

    return urlencode($underscored);
}
|
234 |
} |