| author | cavaliet |
| Tue, 04 Feb 2014 11:23:12 +0100 | |
| changeset 120 | 6fd1ff318825 |
| parent 117 | 5771052a647a |
| child 129 | 65c12455ce74 |
| permissions | -rwxr-xr-x |
| 2 | 1 |
<?php |
|
74
901463f9b11c
add headers for public repository release
ymh <ymh.work@gmail.com>
parents:
68
diff
changeset
|
2 |
/* |
|
901463f9b11c
add headers for public repository release
ymh <ymh.work@gmail.com>
parents:
68
diff
changeset
|
3 |
* This file is part of the WikiTagBundle package. |
|
901463f9b11c
add headers for public repository release
ymh <ymh.work@gmail.com>
parents:
68
diff
changeset
|
4 |
* |
|
901463f9b11c
add headers for public repository release
ymh <ymh.work@gmail.com>
parents:
68
diff
changeset
|
5 |
* (c) IRI <http://www.iri.centrepompidou.fr/> |
|
901463f9b11c
add headers for public repository release
ymh <ymh.work@gmail.com>
parents:
68
diff
changeset
|
6 |
* |
|
901463f9b11c
add headers for public repository release
ymh <ymh.work@gmail.com>
parents:
68
diff
changeset
|
7 |
* For the full copyright and license information, please view the LICENSE |
|
901463f9b11c
add headers for public repository release
ymh <ymh.work@gmail.com>
parents:
68
diff
changeset
|
8 |
* file that was distributed with this source code. |
|
901463f9b11c
add headers for public repository release
ymh <ymh.work@gmail.com>
parents:
68
diff
changeset
|
9 |
*/ |
| 2 | 10 |
|
11 |
namespace IRI\Bundle\WikiTagBundle\Utils; |
|
12 |
||
13 |
use IRI\Bundle\WikiTagBundle\Entity\Tag; |
|
14 |
||
15 |
/**
 * Static helpers used to normalize tag labels and to resolve them against
 * the Wikipedia API and a DBpedia SPARQL endpoint.
 *
 * The endpoint URLs and extra cURL options are read from the container
 * parameters "wiki_tag.url_templates" and "wiki_tag.curl_options" through
 * $GLOBALS["kernel"].
 */
class WikiTagUtils
{
    /**
     * Cleans the tag label: trims it, replaces underscores by spaces,
     * collapses whitespace runs into one space and upper-cases the first
     * character.
     *
     * @param string|null $tag_label
     * @return string|null the normalized label; empty/null input is returned untouched
     */
    public static function normalizeTag($tag_label)
    {
        if (strlen((string) $tag_label) == 0) {
            return $tag_label;
        }

        $tag_label = trim($tag_label);
        $tag_label = str_replace("_", " ", $tag_label);

        $collapsed = preg_replace('/\s+/u', ' ', $tag_label);
        if ($collapsed === null) {
            // With the "u" modifier preg_replace returns null on invalid UTF-8:
            // fall back to the byte-wise pattern instead of losing the label.
            $collapsed = preg_replace('/\s+/', ' ', $tag_label);
        }

        // Underscores turned into spaces may have left leading/trailing blanks;
        // remove them so that the first character really is the first letter.
        $tag_label = trim($collapsed);

        return self::upperCaseFirst($tag_label);
    }

    /**
     * Upper-cases the first character of a string, multibyte-aware when the
     * mbstring extension is available and the text is valid UTF-8.
     *
     * @param string $text
     * @return string
     */
    private static function upperCaseFirst($text)
    {
        if (function_exists('mb_strtoupper') && function_exists('mb_substr') && mb_check_encoding($text, 'UTF-8')) {
            $length = mb_strlen($text, 'UTF-8');
            return mb_strtoupper(mb_substr($text, 0, 1, 'UTF-8'), 'UTF-8').mb_substr($text, 1, $length, 'UTF-8');
        }

        return ucfirst($text);
    }

    /**
     * Query wikipedia with a normalized label or a pageid.
     * Returns an array with the form
     * array(
     *     'new_label'=>$new_label,
     *     'alternative_label'=>$alternative_label,
     *     'status'=>$status,
     *     'wikipedia_url'=>$url,
     *     'wikipedia_alternative_url'=>$alternative_url,
     *     'pageid'=>$pageid,
     *     'alternative_pageid'=>$alternative_pageid,
     *     'dbpedia_uri'=>$dbpedia_uri,
     *     'revision_id'=> last revision id of the page (of the target page for a redirection),
     *     'response'=> the original wikipedia json response
     * )
     * When nothing usable is found, every value is null except 'status'
     * (the "null_result" status) and possibly 'response'.
     *
     * @param string $tag_label_normalized label to look up; takes precedence over $page_id
     * @param bigint $page_id wikipedia page id, used only when no label is given
     * @param boolean $ignore_wikipedia_error when true, a failed request gives a null result instead of an exception
     * @param object|null $logger optional logger; its error() method is called when an error is ignored
     * @return array
     * @throws \Exception when the request fails and $ignore_wikipedia_error is false
     */
    public static function getWikipediaInfo($tag_label_normalized, $page_id=null, $ignore_wikipedia_error=false, $logger = null)
    {
        $params = array(
            'action' => 'query',
            'prop' => 'info|categories|langlinks',
            'inprop' => 'url',
            'lllimit' => '500',
            'cllimit' => '500',
            'rvprop' => 'ids',
            'format' => 'json',
        );

        if ($tag_label_normalized != null) {
            // requestWikipedia() concatenates values verbatim, so encode here.
            $params['titles'] = urlencode($tag_label_normalized);
        } elseif ($page_id != null) {
            $params['pageids'] = $page_id;
        } else {
            return self::returnNullResult(null);
        }

        $ar = self::requestWikipediaOrNull($params, $ignore_wikipedia_error, $logger, "Error when querying wikipedia : ");
        if ($ar === null) {
            return self::returnNullResult(null);
        }

        list($res, $pages) = $ar;
        $original_response = $res;

        // If there is 0 or more than 1 result, the query has failed.
        if (count($pages) != 1) {
            return self::returnNullResult($res);
        }

        $page = reset($pages);

        // Unknown entry ?
        if (array_key_exists('missing', $page) || array_key_exists('invalid', $page)) {
            return self::returnNullResult($res);
        }

        // The entry exists, we get the data.
        $url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
        $pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
        $new_label = array_key_exists('title', $page) ? $page['title'] : null;

        // We test the status (redirect first because a redirect has no categories key).
        if (array_key_exists('redirect', $page)) {
            $status = Tag::$TAG_URL_STATUS_DICT["redirection"];
        } elseif (self::isHomonymy($page)) {
            $status = Tag::$TAG_URL_STATUS_DICT["homonyme"];
        } else {
            $status = Tag::$TAG_URL_STATUS_DICT["match"];
        }

        $alternative_label = null;
        $alternative_url = null;
        $alternative_pageid = null;

        // For a redirection, query again with redirects=true to get the target page.
        if ($status == Tag::$TAG_URL_STATUS_DICT["redirection"]) {
            $params['redirects'] = "true";

            $ar = self::requestWikipediaOrNull($params, $ignore_wikipedia_error, $logger, "Error when querying wikipedia for redirection : ");
            if ($ar === null) {
                return self::returnNullResult(null);
            }

            list($res, $pages) = $ar;
            if (count($pages) != 1) {
                return self::returnNullResult($res);
            }

            // From here on $page is the redirection target.
            $page = reset($pages);
            $alternative_label = array_key_exists('title', $page) ? $page['title'] : null;
            $alternative_url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
            $alternative_pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
        }

        $revision_id = array_key_exists('lastrevid', $page) ? $page['lastrevid'] : null;

        // Get the dbpedia uri by requesting dbpedia with sparql.
        $dbpedia_uri = self::getDbpediaUri($new_label);

        return array(
            'new_label' => $new_label,
            'alternative_label' => $alternative_label,
            'status' => $status,
            'wikipedia_url' => $url,
            'wikipedia_alternative_url' => $alternative_url,
            'pageid' => $pageid,
            'alternative_pageid' => $alternative_pageid,
            'dbpedia_uri' => $dbpedia_uri,
            'revision_id' => $revision_id,
            'response' => $original_response,
        );
    }

    /**
     * Calls requestWikipedia() and applies the "ignore error" policy of
     * getWikipediaInfo().
     *
     * @param array $params
     * @param boolean $ignore_wikipedia_error
     * @param object|null $logger
     * @param string $log_prefix beginning of the logged message
     * @return array|null the requestWikipedia() result, or null when an error was ignored
     * @throws \Exception when the request fails and errors are not ignored
     */
    private static function requestWikipediaOrNull($params, $ignore_wikipedia_error, $logger, $log_prefix)
    {
        try {
            return self::requestWikipedia($params);
        } catch (\Exception $e) {
            if (!$ignore_wikipedia_error) {
                throw $e;
            }
            if (!is_null($logger)) {
                $logger->error($log_prefix.$e->getMessage()." with trace : ".$e->getTraceAsString());
            }
            return null;
        }
    }

    /**
     * Generic curl request.
     *
     * Options found in the "wiki_tag.curl_options" parameter (typically the
     * proxy configuration) are applied on top of the defaults: keys are names
     * of CURLOPT_* constants, values may be the strings "true"/"false" or the
     * name of a constant (e.g. "CURLPROXY_HTTP").
     *
     * @param string $url
     * @param boolean $throw_error when false, a cURL error is not reported by an exception
     * @return string|boolean the raw response body (not decoded), false when the request failed
     * @throws \Exception on cURL error when $throw_error is true
     */
    private static function curlRequest($url, $throw_error=true)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // default values
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:26.0) Gecko/20100101 Firefox/26.0');
        curl_setopt($ch, CURLOPT_TIMEOUT_MS, 5000);

        $curl_options = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.curl_options");
        if (is_array($curl_options)) {
            foreach ($curl_options as $key => $value) {
                if (!is_string($key) || !defined($key)) {
                    // Unknown option name: report it but keep the request going.
                    trigger_error("Unknown curl option in wiki_tag.curl_options : ".$key, E_USER_WARNING);
                    continue;
                }
                // Only string values can be a boolean keyword or a constant name.
                if (is_string($value)) {
                    $upper = strtoupper($value);
                    if ($upper == 'TRUE') {
                        $value = true;
                    } elseif ($upper == 'FALSE') {
                        $value = false;
                    } elseif (defined($value)) {
                        $value = constant($value);
                    }
                }
                curl_setopt($ch, constant($key), $value);
            }
        }

        $res = curl_exec($ch);
        $curl_errno = curl_errno($ch);
        $curl_error = curl_error($ch);
        curl_close($ch);

        if ($curl_errno > 0 && $throw_error) {
            throw new \Exception("$url\n request failed. cURLError #$curl_errno: $curl_error\n", $curl_errno, null);
        }

        return $res;
    }

    /**
     * Build and do the request to Wikipedia.
     *
     * Parameter values are concatenated verbatim: the caller must url-encode
     * them when needed.
     *
     * @param array $params query string parameters
     * @return array array(raw response, pages); pages is an empty array when
     *               the response has no "query"/"pages" entry or is not valid json
     * @throws \Exception when the curl request fails
     */
    private static function requestWikipedia($params)
    {
        $pairs = array();
        foreach ($params as $key => $value) {
            $pairs[] = $key.'='.$value;
        }

        $templates = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates");
        $url = $templates["wikipedia_api"].'?'.implode('&', $pairs);

        $res = self::curlRequest($url);
        $val = json_decode($res, true);

        $pages = array();
        if (is_array($val) && isset($val["query"]["pages"]) && is_array($val["query"]["pages"])) {
            $pages = $val["query"]["pages"];
        }

        return array($res, $pages);
    }

    /**
     * Returns tag with a null result, usually used after a failed request on Wikipedia.
     * The array has the same keys as a successful getWikipediaInfo() result.
     *
     * @param mixed $response the raw wikipedia response, or null
     * @return array
     */
    private static function returnNullResult($response)
    {
        return array(
            'new_label' => null,
            'alternative_label' => null,
            'status' => Tag::$TAG_URL_STATUS_DICT['null_result'],
            'wikipedia_url' => null,
            'wikipedia_alternative_url' => null,
            'pageid' => null,
            'alternative_pageid' => null,
            'dbpedia_uri' => null,
            'revision_id' => null,
            'response' => $response,
        );
    }

    /**
     * Tells if a wikipedia page is a homonymy (disambiguation) page, i.e. if
     * one of its category titles contains 'Catégorie:Homonymie' or
     * 'Category:Disambiguation'.
     *
     * @param array $page one page entry of the wikipedia response
     * @return boolean false when the page has no categories
     */
    private static function isHomonymy($page)
    {
        if (!is_array($page) || !isset($page["categories"]) || !is_array($page["categories"])) {
            return false;
        }

        foreach ($page["categories"] as $category) {
            if (!isset($category["title"])) {
                continue;
            }
            // Strict test because strpos can return 0, which is not false.
            if (strpos($category["title"], 'Catégorie:Homonymie') !== false || strpos($category["title"], 'Category:Disambiguation') !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Gets the DbPedia URI of a resource by querying the sparql endpoint.
     *
     * @param string $label value to search; its meaning depends on $req_param
     * @param array $params currently ignored (overwritten by the generated query), kept for compatibility
     * @param boolean $throw_error passed to the curl request: when false, curl errors are not thrown
     * @param string $req_param "label" (rdfs:label in the endpoint language), "pageid"
     *                          (dbpedia-owl:wikiPageID) or "wikiurl" (foaf:isPrimaryTopicOf)
     * @return string the uri, or an empty string when nothing was found
     * @throws \Exception on curl error when $throw_error is true
     */
    public static function getDbpediaUri($label, $params=[], $throw_error=true, $req_param="label")
    {
        $templates = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates");
        $dbp_url = $templates["dbpedia_sparql"];

        if ($req_param == "pageid") {
            // NOTE: $label is inserted verbatim, callers must pass a numeric id.
            $query = 'select distinct * where { ?s dbpedia-owl:wikiPageID '.$label.' }';
        } elseif ($req_param == "wikiurl") {
            $query = 'select distinct * where { ?s foaf:isPrimaryTopicOf <'.$label.'> }';
        } else {
            $lang = self::extractDbpediaLang($dbp_url);
            // Escape backslashes then double quotes so the label stays one sparql literal.
            $literal = str_replace(array('\\', '"'), array('\\\\', '\\"'), $label);
            // filter with regexp to avoid results with "category:LABEL" or other "abc:LABEL"
            $query = 'select distinct * where { ?s rdfs:label "'.$literal.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }';
        }

        $params = [
            "query" => $query,
            "format" => 'application/json',
        ];

        // Explicit '&' separator: arg_separator.output may be changed in php.ini.
        $url = $dbp_url.'?'.http_build_query($params, '', '&');

        $res = self::curlRequest($url, $throw_error);
        $val = json_decode($res, true);

        if (!is_array($val) || !isset($val["results"]["bindings"]) || !is_array($val["results"]["bindings"])) {
            return "";
        }

        $bindings = $val["results"]["bindings"];
        if (count($bindings) == 0) {
            return "";
        }

        $first = reset($bindings);
        $uri = isset($first["s"]["value"]) ? $first["s"]["value"] : "";

        if (count($bindings) > 1) {
            // If there are several results, we test the "url label" to see if it matches the label.
            // Why ? Because, for example "1000" gets "Category:1000" and "1000" as result.
            // We keep this code to be safe but the regexp in the sparql request normally avoids this problem.
            $expected = str_replace(" ", "_", $label);
            foreach ($bindings as $binding) {
                if (!isset($binding["s"]["value"])) {
                    continue;
                }
                $res_uri = $binding["s"]["value"];
                $url_label = substr($res_uri, strrpos($res_uri, '/') + 1);
                // Strict comparison: numeric-looking labels ("1000" / "1e3") must not match loosely.
                if ((string) $expected === $url_label) {
                    $uri = $res_uri;
                }
            }
        }

        return $uri;
    }

    /**
     * Extracts the language code from the sparql endpoint url: the two-letter
     * sub-domain of the host (e.g. "fr" in http://fr.dbpedia.org/sparql).
     * Falls back to "en" when the host has no such sub-domain.
     *
     * @param string $sparql_url
     * @return string
     */
    private static function extractDbpediaLang($sparql_url)
    {
        $host = parse_url($sparql_url, PHP_URL_HOST);
        if (is_string($host) && preg_match('/^([a-z]{2})\./i', $host, $matches)) {
            return strtolower($matches[1]);
        }

        return 'en';
    }

    /**
     * URLencode label for wikipedia (spaces are replaced by underscores first).
     *
     * @param string $label
     * @return string
     */
    private static function urlize_for_wikipedia($label)
    {
        return urlencode(str_replace(" ", "_", $label));
    }
}